diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..0cd58331b2a989b68be4ec5676383437fca8687b
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..8a0162bfc076e35c9b4d87579f05f86ff2639a43
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.venv
+__pycache__
+.bak
+megablocks-moe/.bak
+.pytest_cache
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2bed67f0bc24fc62546154442dad44b08d71d39c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,76 @@
+---
+license: apache-2.0
+tags:
+ - kernels
+---
+
+## Quickstart
+
+```bash
+uv run https://huggingface.co/kernels-community/megablocks/raw/main/readme_example.py
+```
+
+```python
+# /// script
+# requires-python = "==3.10"
+# dependencies = [
+# "numpy",
+# "kernels",
+# "torch"
+# ]
+# ///
+
+import torch
+from collections import namedtuple
+
+from kernels import get_kernel
+
+# Make reproducible
+torch.manual_seed(42)
+torch.cuda.manual_seed(42)
+
+# Download optimized kernels from the Hugging Face hub
+megablocks = get_kernel("kernels-community/megablocks")
+print("MegaBlocks kernel downloaded successfully.")
+
+model = megablocks.layers.MegaBlocksMoeMLP()
+model.experts = namedtuple("Experts", ["gate_up_proj", "gate_up_proj_bias", "down_proj", "down_proj_bias", "hidden_size"])
+print("MegaBlocksMoeMLP instance created successfully.")
+
+# Config
+ne, hs, isz = 128, 1152, 3072
+
+# Router with proper initialization
+model.router = torch.nn.Linear(hs, ne, device="cuda")
+torch.nn.init.kaiming_uniform_(model.router.weight)
+
+# Expert layers with realistic weights
+e = model.experts
+e.gate_up_proj = torch.nn.Parameter(torch.randn(ne, hs, isz, device="cuda") * 0.02)
+e.gate_up_proj_bias = torch.nn.Parameter(torch.zeros(ne, isz, device="cuda"))
+e.down_proj = torch.nn.Parameter(torch.randn(ne, 1536, hs, device="cuda") * 0.02)  # 1536 = isz // 2 (gate/up interleave halves the width)
+e.down_proj_bias = torch.nn.Parameter(torch.zeros(ne, hs, device="cuda"))
+e.hidden_size = hs
+print("Expert layers initialized successfully.")
+
+# Test with normalized input
+x = torch.randn(1, 1, hs, device="cuda") * 0.1
+output, expert_weights = model(x)
+print("Model forward pass completed successfully.")
+
+print(f"Output shape: {output.shape}")
+print(f"Output range: [{output.min():.3f}, {output.max():.3f}]")
+print(f"Output: {output.flatten()[:10]}")
+print(f"Expert weights sum: {expert_weights.sum():.3f}")
+```
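+
+The benchmark script in `benchmarks/benchmark.py` includes a pure-PyTorch reference
+(`moe_mlp_reference`) that can be used to sanity-check the kernel output. A minimal
+sketch, assuming you copy that function into your script and that the layer routes
+with top-k = 4:
+
+```python
+ref_out, _ = moe_mlp_reference(
+    x,
+    model.router.weight,
+    model.router.bias,
+    e.gate_up_proj,
+    e.gate_up_proj_bias,
+    e.down_proj,
+    e.down_proj_bias,
+    top_k=4,  # assumption: matches the layer's routing top-k
+)
+torch.testing.assert_close(output, ref_out, rtol=1e-2, atol=1e-2)
+```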
+
+### Performance
+
+
+
+
+
+
+
+
+
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..780d585a4068ff525a8cd071e64cdc1a4d39c988
--- /dev/null
+++ b/benchmarks/benchmark.py
@@ -0,0 +1,233 @@
+import torch
+import torch.nn.functional as F
+from collections import namedtuple
+
+from kernels.benchmark import Benchmark
+
+
+def moe_mlp_reference(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ gate_up_proj: torch.Tensor,
+ gate_up_proj_bias: torch.Tensor,
+ down_proj: torch.Tensor,
+ down_proj_bias: torch.Tensor,
+ top_k: int = 4,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+) -> tuple[torch.Tensor, torch.Tensor]:
+ in_shape = x.shape
+ num_experts = router_weight.shape[0]
+ hidden_size = x.shape[-1]
+
+ # Flatten to (num_tokens, hidden_size)
+ hidden_states = x.view(-1, hidden_size)
+ num_tokens = hidden_states.shape[0]
+
+ # Router: compute logits and get top-k experts
+ logits = F.linear(hidden_states, router_weight, router_bias)
+ expert_weights, router_indices = torch.topk(logits, top_k, dim=-1)
+ routing_weights = F.softmax(expert_weights, dim=-1)
+
+ # Initialize output
+ next_states = torch.zeros_like(hidden_states)
+
+ # Create expert mask using one_hot
+ with torch.no_grad():
+ expert_mask = F.one_hot(router_indices, num_classes=num_experts)
+ expert_mask = expert_mask.permute(2, 1, 0) # (num_experts, top_k, num_tokens)
+ # Find which experts are hit
+ expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+
+ # Process each expert that has tokens
+ for expert_idx in expert_hit:
+ expert_idx = expert_idx[0]
+ with torch.no_grad():
+ top_k_idx, token_idx = torch.where(expert_mask[expert_idx])
+
+ current_state = hidden_states[token_idx]
+
+ # Up projection
+ gate_up = (
+ current_state @ gate_up_proj[expert_idx] + gate_up_proj_bias[expert_idx]
+ )
+
+ # Split into gate and up
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+
+ # Clamp
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+
+ # SwiGLU-like activation
+ glu = gate * torch.sigmoid(gate * alpha)
+ gated_output = (up + 1) * glu
+
+ # Down projection
+ out = gated_output @ down_proj[expert_idx] + down_proj_bias[expert_idx]
+
+ # Get the routing weight for this expert at the correct top_k position
+ weights_for_expert = routing_weights[token_idx, top_k_idx]
+ weighted_output = out * weights_for_expert[:, None]
+ next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
+
+ return next_states.view(in_shape), routing_weights
+
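+# Shape note: for x of shape [seq, batch, hidden_size] the reference returns an output
+# of the same shape and routing weights of shape [num_tokens, top_k], where
+# num_tokens = seq * batch.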
+
+class MegaBlocksMoeBenchmark(Benchmark):
+ seed: int = 42
+
+ def setup(self):
+ # Config matching readme_example.py
+ ne, hs, isz = 128, 1152, 3072
+ batch, seq = 8, 1
+
+ # Router
+ self.router_weight = torch.randn(
+ ne, hs, device=self.device, dtype=torch.float32
+ )
+ torch.nn.init.kaiming_uniform_(self.router_weight)
+ self.router_bias = torch.zeros(ne, device=self.device, dtype=torch.float32)
+
+ # Expert weights
+ self.gate_up_proj = (
+ torch.randn(ne, hs, isz, device=self.device, dtype=torch.float32) * 0.02
+ )
+ self.gate_up_proj_bias = torch.zeros(
+ ne, isz, device=self.device, dtype=torch.float32
+ )
+ self.down_proj = (
+ torch.randn(ne, isz // 2, hs, device=self.device, dtype=torch.float32)
+ * 0.02
+ )
+ self.down_proj_bias = torch.zeros(
+ ne, hs, device=self.device, dtype=torch.float32
+ )
+
+ # Input
+ self.x = (
+ torch.randn(seq, batch, hs, device=self.device, dtype=torch.float32) * 0.1
+ )
+
+ # Setup the model
+ self.model = self.kernel.layers.MegaBlocksMoeMLP()
+ self.model.router = torch.nn.Linear(hs, ne, device=self.device)
+ self.model.router.weight.data = self.router_weight.clone()
+ self.model.router.bias.data = self.router_bias.clone()
+
+ Experts = namedtuple(
+ "Experts",
+ [
+ "gate_up_proj",
+ "gate_up_proj_bias",
+ "down_proj",
+ "down_proj_bias",
+ "hidden_size",
+ "num_experts",
+ ],
+ )
+ self.model.experts = Experts(
+ gate_up_proj=torch.nn.Parameter(self.gate_up_proj.clone()),
+ gate_up_proj_bias=torch.nn.Parameter(self.gate_up_proj_bias.clone()),
+ down_proj=torch.nn.Parameter(self.down_proj.clone()),
+ down_proj_bias=torch.nn.Parameter(self.down_proj_bias.clone()),
+ hidden_size=hs,
+ num_experts=ne,
+ )
+
+ self.out = torch.empty(seq, batch, hs, device=self.device, dtype=torch.float32)
+
+ def benchmark_base(self):
+ self.out, self.expert_weights = self.model(self.x)
+
+ def verify_base(self) -> torch.Tensor:
+ ref_out, _ = moe_mlp_reference(
+ self.x,
+ self.router_weight,
+ self.router_bias,
+ self.gate_up_proj,
+ self.gate_up_proj_bias,
+ self.down_proj,
+ self.down_proj_bias,
+ top_k=4,
+ )
+ return ref_out
+
+ def setup_large(self):
+ # Larger config with more tokens
+ ne, hs, isz = 128, 1152, 3072
+ batch, seq = 32, 16
+
+ # Router
+ self.router_weight = torch.randn(
+ ne, hs, device=self.device, dtype=torch.float32
+ )
+ torch.nn.init.kaiming_uniform_(self.router_weight)
+ self.router_bias = torch.zeros(ne, device=self.device, dtype=torch.float32)
+
+ # Expert weights
+ self.gate_up_proj = (
+ torch.randn(ne, hs, isz, device=self.device, dtype=torch.float32) * 0.02
+ )
+ self.gate_up_proj_bias = torch.zeros(
+ ne, isz, device=self.device, dtype=torch.float32
+ )
+ self.down_proj = (
+ torch.randn(ne, isz // 2, hs, device=self.device, dtype=torch.float32)
+ * 0.02
+ )
+ self.down_proj_bias = torch.zeros(
+ ne, hs, device=self.device, dtype=torch.float32
+ )
+
+ # Input
+ self.x = (
+ torch.randn(seq, batch, hs, device=self.device, dtype=torch.float32) * 0.1
+ )
+
+ # Setup the model
+ self.model = self.kernel.layers.MegaBlocksMoeMLP()
+ self.model.router = torch.nn.Linear(hs, ne, device=self.device)
+ self.model.router.weight.data = self.router_weight.clone()
+ self.model.router.bias.data = self.router_bias.clone()
+
+ Experts = namedtuple(
+ "Experts",
+ [
+ "gate_up_proj",
+ "gate_up_proj_bias",
+ "down_proj",
+ "down_proj_bias",
+ "hidden_size",
+ "num_experts",
+ "capacity_factor",
+ ],
+ )
+ self.model.experts = Experts(
+ gate_up_proj=torch.nn.Parameter(self.gate_up_proj.clone()),
+ gate_up_proj_bias=torch.nn.Parameter(self.gate_up_proj_bias.clone()),
+ down_proj=torch.nn.Parameter(self.down_proj.clone()),
+ down_proj_bias=torch.nn.Parameter(self.down_proj_bias.clone()),
+ hidden_size=hs,
+ num_experts=ne,
+ capacity_factor=4.0, # Higher capacity to avoid token dropping
+ )
+
+ self.out = torch.empty(seq, batch, hs, device=self.device, dtype=torch.float32)
+
+ def benchmark_large(self):
+ self.out, self.expert_weights = self.model(self.x)
+
+ def verify_large(self) -> torch.Tensor:
+ ref_out, _ = moe_mlp_reference(
+ self.x,
+ self.router_weight,
+ self.router_bias,
+ self.gate_up_proj,
+ self.gate_up_proj_bias,
+ self.down_proj,
+ self.down_proj_bias,
+ top_k=4,
+ )
+ return ref_out
diff --git a/build.toml b/build.toml
new file mode 100644
index 0000000000000000000000000000000000000000..863ecb6568219c7bb42b06cc0a81445a7639fc8a
--- /dev/null
+++ b/build.toml
@@ -0,0 +1,43 @@
+[general]
+name = "megablocks"
+universal = false
+
+[torch]
+src = [
+ "torch-ext/torch_binding.cpp",
+ "torch-ext/torch_binding.h"
+]
+
+[kernel.megablocks]
+backend = "cuda"
+cuda-capabilities = [
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0",
+ "10.0",
+ "10.1",
+ "11.8",
+ "12.0"
+]
+depends = ["torch", "cutlass_3_8"]
+src = [
+ "csrc/new_cumsum.h",
+ "csrc/new_cumsum.cu",
+ "csrc/new_histogram.h",
+ "csrc/new_histogram.cu",
+ "csrc/new_indices.h",
+ "csrc/new_indices.cu",
+ "csrc/new_replicate.cu",
+ "csrc/new_replicate.h",
+ "csrc/new_sort.h",
+ "csrc/new_sort.cu",
+ # vendored grouped gemm
+ "csrc/grouped_gemm/fill_arguments.cuh",
+ "csrc/grouped_gemm/grouped_gemm.cu",
+ "csrc/grouped_gemm/grouped_gemm.h",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/__init__.py b/build/torch210-cxx11-cpu-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
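+
+# Usage sketch (assumes a CUDA-enabled build and int32 input):
+#
+#   x = torch.randint(0, 16, (1024,), device="cuda", dtype=torch.int32)
+#   counts = histogram(x, num_bins=16)               # occurrences per bin
+#   starts = cumsum(counts, dim=0, exclusive=True)   # starting offset of each bin
+#   sorted_vals, sorted_idx = argsort(x, end_bit=4)  # 4 bits cover values 0..15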
+
+
+# Export public API
+__all__ = [
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/__init__.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/activation_fn.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/all_to_all.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
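+
+
+# Split-size example: on a 2-rank process group, a rank that sends 3 rows to rank 0
+# and 5 rows to rank 1 passes input_split_sizes=[3, 5]; output_split_sizes[i] must
+# equal the number of rows rank i sends to this rank (e.g. [3, 4]).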
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/arguments.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear # class of the fully connected layer in shared expert (purpose: to allow using a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
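+
+
+# Illustrative configuration sketch (uses only fields defined above): a dMoE set up
+# for the grouped (non-sparse) MLP backend, which __post_init__ accepts with
+# triton >= 3.2.0 as long as grouped GEMM is available:
+#
+#   args = Arguments(
+#       hidden_size=1024,
+#       ffn_hidden_size=4096,
+#       moe_num_experts=8,
+#       moe_top_k=2,
+#       mlp_impl='grouped',
+#       bf16=True,
+#       fp16=False,
+#   )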
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/common.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/dmlp_registry.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
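+
+
+# Example: get(Arguments(mlp_type='glu', mlp_impl='grouped')) returns a GroupedGLU;
+# an unknown mlp_type or an unsupported backend raises ValueError.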
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/dmoe.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
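+
+    # Worked example: with blocking=128 and tokens_per_expert = [5, 200, 130],
+    # round_up gives [128, 256, 256], so padded_bins = [128, 384, 640] (inclusive
+    # cumsum) and bins = [5, 205, 335].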
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/gelu.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
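+
+# Note: 0.79788456 ≈ sqrt(2/pi) and 0.1070322243 ≈ 3 * 0.044715 * sqrt(2/pi), i.e. the
+# constants of the derivative of the tanh-approximate GeLU used in the backward above.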
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/glu.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+    """GLU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/memory_test.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MiB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/mlp.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+    def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+        # this rank. If the master weights are created first, the PyTorch
+        # caching allocator appears to use the same memory block for these
+        # and the slice, which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+        # this rank. If the master weights are created first, the PyTorch
+        # caching allocator appears to use the same memory block for these
+        # and the slice, which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
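+# Illustrative call site (see GroupedMLP.forward below): the function is
+# invoked through the `.apply` alias with per-expert token counts, e.g.
+#
+#   out = memory_optimized_grouped_mlp(x, w1, w2, batch_sizes, activation_fn)
+#
+# where `batch_sizes` is a 1-D int64 CPU tensor holding the number of tokens
+# routed to each expert.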
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+    Note: this is a copy -> paste -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+            # Enable using a weighted sum for the shared expert output,
+            # weighted by the number of experts used.
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
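+    # Worked example for the weighted sum above (illustrative): with
+    # moe_top_k = 4 we have t_experts = 5, so the shared expert output is
+    # scaled by 1/5 and the routed expert output by 4/5.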
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/moe.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
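+# Illustrative scaling for the loss above: with moe_num_experts = 8,
+# moe_loss_weight = 0.01, num_layers = 12, tokens = 4096 and moe_top_k = 2,
+# scale = (8 * 0.01) / (12 * 4096 * 2) ~ 8.1e-7 before the dot product.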
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
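+    # Example (illustrative): with top_k = 2, tokens = 4096, world_size = 1,
+    # num_experts = 64 and moe_capacity_factor = 1.25, the capacity is
+    # int(1.25 * 2 * 4096 / 64) = 160 tokens per expert.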
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
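+    # Example (illustrative): for top_expert = [2, 0, 2, 1] with 3 experts,
+    # sort gives bin_ids = [0, 1, 2, 2] and indices = [1, 3, 0, 2], the
+    # histogram gives tokens_per_expert = [1, 1, 2], and the inclusive
+    # cumsum gives bins = [1, 2, 4].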
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/mpu.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
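+# Example (illustrative): with an expert parallel world size of 8 and
+# moe_num_experts = 4, expert_sharding_degree = 4 and hidden_sharding_degree = 2,
+# so each rank owns one expert and ffn_hidden_size // 2 of its features.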
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/router.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch210-cxx11-cpu-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_megablocks_cpu_7a6bcf4.abi3.so b/build/torch210-cxx11-cpu-x86_64-linux/_megablocks_cpu_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..5ba8ac355e5d280728ad5f5585983fc4627eb4ad
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_megablocks_cpu_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10f9bb557d5036d3b215b4dabcb0c44a51d276a6a3ab67c37c37dfca3259f824
+size 2219080
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_ops.py b/build/torch210-cxx11-cpu-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..102573e975ffc7897e0e9c4edca028ed1dc67419
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cpu_7a6bcf4
+ops = torch.ops._megablocks_cpu_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cpu_7a6bcf4::{op_name}"
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/_version.py b/build/torch210-cxx11-cpu-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/backend/__init__.py b/build/torch210-cxx11-cpu-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/backend/kernels.py b/build/torch210-cxx11-cpu-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have CUDA.
+# This approach preserves the original code but enables testing without a GPU.
+if torch.cuda.is_available() is False:
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
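+# Example of the bins/padded_bins contract (illustrative): with
+# tokens_per_expert = [1, 3, 2], bins = [1, 4, 6]; if each expert's rows are
+# padded up to a multiple of 4, padded_bins = [4, 8, 12] and padded_gather
+# allocates padded_bins[-1] = 12 output rows.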
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per entry in 'tokens * top_k'. Array 'x' has a greater
+    # or equal number of rows since it could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# x: (num_experts, expert_capacity, num_columns), real.
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens * top_k), real.
+# indices: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/benchmark_util.py b/build/torch210-cxx11-cpu-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/cpu_fused_moe.py b/build/torch210-cxx11-cpu-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
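+# Example (illustrative): for scalar inputs gate = 2.0 and up = 0.5 with the
+# default alpha and limit, glu = 2.0 * sigmoid(3.404) ~ 1.94, so the output
+# is (0.5 + 1) * glu ~ 2.90.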
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+    This implementation iterates over the experts and processes each expert's
+    assigned tokens with batched PyTorch operations, avoiding per-token Python loops.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # For each expert, find the (token_idx, topk_pos) pairs routed to it and
+    # process those tokens as a single batch.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py b/build/torch210-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
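+
+    Example (illustrative sketch; variable names are hypothetical):
+        Pack the expert weights once with ops.convert_weight_packed, then call
+        fused_moe_cpp(x_bf16, w1_packed, w2_packed, topk_weights,
+                      topk_ids.to(torch.int32), alpha=1.702, limit=7.0,
+                      is_vnni=True).
+        See CPUMegaBlocksMoeMLP.forward for the full wiring.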
+ """
+    # MXFP4/FP8 kernels only support bf16, and the C++ kernel has no fp32 path,
+    # so convert the activations when needed.
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
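+            # Pack the expert weights into the VNNI layout expected by the brgemm
+            # kernel; this runs once and later calls reuse the packed tensors.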
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP"]
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/grouped_gemm/__init__.py b/build/torch210-cxx11-cpu-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/grouped_gemm/backend.py b/build/torch210-cxx11-cpu-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/grouped_gemm/ops.py b/build/torch210-cxx11-cpu-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
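+
+
+# Illustrative call (shapes only, assumed example): with a of shape [6, 64],
+# b of shape [2, 64, 32] and batch_sizes = torch.tensor([2, 4]), gmm runs two
+# GEMMs (2x64 @ 64x32 and 4x64 @ 64x32) and returns a [6, 32] tensor.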
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/grouped_gemm_util.py b/build/torch210-cxx11-cpu-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored into this package, so there is nothing to import
+    # here; the try/except mirrors the upstream megablocks structure.
+    _grouped_gemm_is_available = True
+except ImportError:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/layers.py b/build/torch210-cxx11-cpu-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
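+
+
+# Worked example (illustrative): with world_size=8, moe_num_experts=4 and
+# ffn_hidden_size=3072, the expert sharding degree is 4 and the hidden sharding
+# degree is 2, so each rank holds one expert and 1536 of its ffn features.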
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
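+    # Clamped SwiGLU (GptOss-style): next = ((up + 1) * gate * sigmoid(alpha * gate)) @ w2 + b2,
+    # with gate clamped from above at `limit` and up clamped to [-limit, limit].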
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
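+
+
+# Illustrative weighting: with moe_top_k=3 and shared_expert_weighted_sum=True,
+# the shared expert contributes 1/4 of the output and the routed experts 3/4.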
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+    assert all(
+        x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens
+        for x in expert_scores
+    )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, score_num_experts = expert_scores.size()
+    assert score_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (count_num_experts,) = tokens_per_expert.size()
+    assert count_num_experts == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
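+
+
+# Illustrative example (4 tokens, 3 experts): top_expert = [2, 0, 2, 1] gives
+# bin_ids = [0, 1, 2, 2], indices = [1, 3, 0, 2], tokens_per_expert = [1, 1, 2]
+# and bins = [1, 2, 4] (the inclusive cumsum of tokens_per_expert).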
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
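+
+
+# Illustrative capacity: 1024 tokens, top_k=4, 128 experts, capacity_factor=1.0
+# and no expert parallelism give int(1.0 * 4 * 1024 / 128) = 32 slots per expert.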
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Ensure CUB knows which device to use
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
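+    # (a dense [num_experts, num_tokens] map with the top-k weights scattered in)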
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
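+        # Number of bits the radix sort needs to cover all expert ids.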
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/megablocks/__init__.py b/build/torch210-cxx11-cpu-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/metadata.json b/build/torch210-cxx11-cpu-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..eb22148b3f551be150f7824a5684c19bbc40ae0e
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/metadata.json
@@ -0,0 +1,8 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cpu"
+ }
+}
\ No newline at end of file
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/__init__.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per fp16 element.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
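+    # NOTE: init_process_group returns None; passing None to the calls below
+    # selects the default process group.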
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/binned_gather.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
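+        # The gradient of a binned gather is a binned scatter with the same
+        # indices and bins.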
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/binned_scatter.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
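+        # x is laid out as (num_experts, expert_capacity, hidden_size); the
+        # capacity is needed to rebuild this layout in the backward pass.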
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/cumsum.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
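+        # 1-D inputs are promoted to 2-D so the kernel can scan along dim 1,
+        # then squeezed back to their original shape.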
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/gather.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/histogram.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
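+        # Counts how many entries of x fall into each integer bucket in
+        # [0, max_val), e.g. tokens per expert when x holds expert ids.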
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/histogram_benchmark.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
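+    # Time each call with CUDA events; elapsed_time reports milliseconds.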
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/matmul_benchmark.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1), which in turn calls
+# torch.as_strided(...). Build the strided view directly to avoid the
+# overhead of that call chain.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/padded_gather.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/padded_scatter.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
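+        # Save x only when a gradient w.r.t. the routing weights is needed,
+        # to avoid holding on to the activations unnecessarily.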
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/permute_benchmark.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/repeat.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/replicate.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/round_up.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
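+    # e.g. round_up(torch.tensor([3, 130], dtype=torch.int32), 128) -> tensor([128, 256]).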
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/scatter.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/sort.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
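+        # end_bit bounds how many low-order bits the underlying radix sort
+        # inspects; None defaults to the full bit width of the dtype.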
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
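+
+# Minimal usage sketch (assumes the C++ ops are built), mirroring the benchmark scripts:
+#   bin_ids, indices = sort(top_expert)  # sorted values and their argsort indices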
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/sort_benchmark.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/stk_autocast.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
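+        # When autocast is active, disable it inside the op and cast eligible
+        # floating-point CUDA args to the autocast dtype up front.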
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/sum.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/ops/topology.py b/build/torch210-cxx11-cpu-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
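+        # One int16 entry per nonzero block; ops.indices fills in the column
+        # index of each block in the block-sparse topology.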
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/__init__.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/backend/__init__.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/backend/autocast.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/backend/sputnik.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
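+    # A non-contiguous 2-D tensor stored column-major (strides (1, num_rows))
+    # is treated as a transposed view.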
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/backend/triton_kernels.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+    error_string = "incompatible dimensions: tensor has a dim of length {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to the sparse output matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
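+    # DDS ("dense = dense @ sparse"): each program computes one dense output
+    # tile. offsets[pid_n]..offsets[pid_n+1] select the nonzero blocks of the
+    # sparse rhs falling in this output block column; column_indices picks the
+    # matching K-block of the dense lhs. With trans_B, blocks are read in their
+    # stored order instead of through block_offsets_t.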
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
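+    # Dense = sparse @ dense, written into `out` in place (no return value).
+    # `data` plus the index/offset tensors describe the sparse lhs in BCSR
+    # form; transpose_a switches to the transposed metadata
+    # (offsets_t / column_indices_t).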
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
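+    # Sparse = dense @ dense: only the nonzero blocks named by
+    # row_indices/column_indices are computed (one program per block) and
+    # written into `out`, which holds the BCSR block data.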
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
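+    # Expand BCSR offsets into one row index per nonzero block: program `pid`
+    # (one per block row) writes its row id into the slot of every block in
+    # that row.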
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/matrix.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+    block_rows = np.prod(shape[:-1]) // block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
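+    # Build the transpose metadata without moving block data: sort blocks by
+    # column to obtain the transposed column indices, keep the permutation as
+    # block_offsets_t, and build offsets_t from a per-column nonzero histogram.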
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/__init__.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
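+
+# Illustrative usage (assumes `a` is an existing stk.Matrix and `b` shares its
+# topology, i.e. was built from the same row_indices/column_indices/offsets):
+#
+#   b = Matrix(a.size(), a.data * 2, a.row_indices, a.column_indices, a.offsets)
+#   c = mul(a, b)  # c.data == a.data * b.data, same sparsity pattern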
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/linear_ops.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
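+    # Treat x and y as close if at most `pct` percent of entries fall outside
+    # a relative tolerance of 5e-2 (loose enough for fp16/bf16 results).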
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
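+    # Expand block coordinates to element coordinates, e.g. with blocking=2 the
+    # block (r, c) becomes (2r, 2c), (2r, 2c+1), (2r+1, 2c), (2r+1, 2c+1).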
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/random/__init__.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/random/random_ops.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
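+    # Build a dense 0/1 mask with round(block_rows * block_cols * (1 - sparsity))
+    # randomly chosen nonzero blocks, each expanded to a blocking x blocking
+    # patch of ones.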
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/stk/random/random_ops_test.py b/build/torch210-cxx11-cpu-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from . import random_ops as random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cpu-x86_64-linux/xpu_fused_moe.py b/build/torch210-cxx11-cpu-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch210-cxx11-cpu-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM path: builds the per-expert token offsets on the host.
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
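+    # Heuristic: pick the smallest candidate block size for which the number of
+    # (expert, token-block) pairs still fits within one block; otherwise fall
+    # back to 1024.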
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
+
+
+def implement_zp(qweight):
+ # change u4 to s4 to avoid zero point in gemm kernel
+ # only support default zero point now
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ op. Temporarily exposed here before the GEMM fusion.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
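+        # Reserve a 256-byte-aligned slice of the flat uint8 workspace and
+        # record its (size, offset) under `name`.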
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
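+
+    Example (illustrative):
+        x = torch.tensor([1, 2, 3], device="cuda")
+        out = torch.empty_like(x)
+        exclusive_cumsum(x, 0, out)  # out is expected to hold [0, 1, 3]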
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
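+
+    Example (illustrative):
+        cumsum(torch.tensor([1, 2, 3], device="cuda"), dim=0)                  # [1, 3, 6]
+        cumsum(torch.tensor([1, 2, 3], device="cuda"), dim=0, exclusive=True)  # [0, 1, 3]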
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
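+
+    Example (illustrative):
+        values, order = argsort(torch.tensor([3, 1, 2], device="cuda", dtype=torch.int32))
+        # values is expected to be [1, 2, 3] and order the original positions [1, 2, 0]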
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/activation_fn.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/all_to_all.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
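+    # Illustrative semantics: with two ranks, input_split_sizes=[a, b] sends the first a
+    # rows of x to rank 0 and the next b rows to rank 1, while output_split_sizes describes
+    # the rows received on this rank. The returned handle is None unless async_op=True.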
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/arguments.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in shared expert (purpose: to allow using custom FC layer eg te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+            try:
+                import triton
+                from packaging import version
+                if version.parse(triton.__version__) >= version.parse('3.2.0'):
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/common.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/dmlp_registry.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e. only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
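+
+    Example (illustrative): mlp_type='glu' with mlp_impl='grouped' resolves to
+    GroupedGLU, while mlp_type='mlp' with mlp_impl='sparse' resolves to SparseMLP.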
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/dmoe.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
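+    # e.g. a 0-dim tensor(5) becomes tensor([5]); tensors with at least one
+    # dimension are returned unchanged.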
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
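+        # Illustrative: padded_tokens=256, ffn_hidden_size=512 and blocking=128 give
+        # block_rows=2, blocks_per_row=4 and offsets [0, 4, 8].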
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/gelu.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
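+    # Derivative of the tanh-approximated GeLU: 0.79788456 ~= sqrt(2/pi) and
+    # 0.1070322243 ~= 3 * 0.044715 * sqrt(2/pi) (coefficient of the cubic term).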
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/glu.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/memory_test.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MiB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/mlp.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
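+    # Identity in the forward pass; the backward pass multiplies the incoming gradient
+    # by `scale` (used to rescale expert gradients under expert model parallelism).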
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+    def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+    # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+            # weighted by number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/moe.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
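+
+# Usage sketch (illustrative, not a prescribed API): during training each MoE layer
+# appends (tokens_per_expert, scores) via save_load_balancing_loss(); a training loop
+# can then add batched_load_balancing_loss(args) to its task loss and call
+# clear_load_balancing_loss() after each step so the buffer does not grow unbounded.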
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
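+
+ # Illustrative numbers: with tokens = 4096, world_size = 1, top_k = 2,
+ # num_experts = 64 and moe_capacity_factor = 1.25, tokens_per_expert is
+ # 2 * 4096 / 64 = 128 and the returned capacity is int(1.25 * 128) = 160.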
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+ # expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to set up for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/mpu.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
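+
+# Worked example (illustrative): with an expert parallel world size of 8,
+# moe_num_experts = 4 and ffn_hidden_size = 4096, expert_sharding_degree() = 4,
+# hidden_sharding_degree() = 2, experts_per_rank() = 1 and features_per_rank() = 2048.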
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/router.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+ # so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
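+
+ # Illustrative: with moe_jitter_eps = 0.01 each activation is scaled by a
+ # uniform random factor drawn from [0.99, 1.01) during training.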
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layers/sharedexpert_registry.py b/build/torch210-cxx11-cu126-aarch64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch210-cxx11-cu126-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..aaae1f716dedb9f0e6616ac3fe9ad730ed68f7f6
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39609d4551be5cbaf91e53da23cf4826040984b94aa8c5a574e69de104b484bd
+size 15124328
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_ops.py b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_version.py b/build/torch210-cxx11-cu126-aarch64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/backend/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/backend/kernels.py b/build/torch210-cxx11-cu126-aarch64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an env that does not have CUDA.
+# This approach preserves the original code but enables testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has greater or equal
+ # number of rows since they could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
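+
+# Shape sketch (illustrative): gather() maps x of shape [tokens, hidden] to
+# [tokens * top_k, hidden] rows grouped by expert; scatter() reverses the
+# permutation and, for top_k > 1, sums the weighted copies back to [tokens, hidden].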
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has greater or equal
+ # number of rows since they could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
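+
+# Shape sketch (illustrative): binned_gather() produces a dense
+# [num_experts, expert_capacity, hidden] tensor (slots beyond each expert's token
+# count stay zero); binned_scatter() folds it back to [tokens, hidden], weighting
+# and summing the top_k copies of each token.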
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/benchmark_util.py b/build/torch210-cxx11-cu126-aarch64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
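+
+# Usage sketch (illustrative, requires a CUDA device):
+#
+#   a = torch.randn(1024, 1024, device='cuda')
+#   mean_ms, std_ms = benchmark_function(lambda: a @ a)
+#   log_benchmark('MatMul', {'shape': '1024x1024'}, mean_ms, std_ms)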
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/cpu_fused_moe.py b/build/torch210-cxx11-cu126-aarch64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
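+
+# Illustrative scalar check: with gate = 2.0, up = 0.5 and the default alpha/limit,
+# the clamps are no-ops, glu = 2.0 * sigmoid(2.0 * 1.702) is approximately 1.94 and
+# the output (0.5 + 1) * glu is approximately 2.90.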
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
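+
+# Shape sketch (illustrative): for x of shape [2, 3, 16], 8 experts and
+# moe_top_k = 2, this returns logits of shape [6, 8], expert_weights [6, 2]
+# and expert_indices [6, 2] (the 2 * 3 = 6 tokens are flattened first).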
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+ This implementation processes all experts in parallel using batched operations
+ instead of sequential for loops, which is more efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+ # Build expert mask: which tokens go to which expert
+ # expert_mask[expert_id] contains indices of (token_idx, topk_pos) pairs
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
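+
+# Minimal usage sketch (illustrative shapes and random weights):
+#
+#   x = torch.randn(4, 32)                  # 4 tokens, hidden_size = 32
+#   w1 = torch.randn(8, 32, 2 * 64) * 0.02  # 8 experts, inter_size = 64
+#   w2 = torch.randn(8, 64, 32) * 0.02
+#   weights = torch.rand(4, 2)              # top_k = 2 routing weights
+#   ids = torch.randint(0, 8, (4, 2))
+#   out = cpu_fused_moe(x, w1, w2, weights, ids)
+#   assert out.shape == (4, 32)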
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/cpu_moe_cpp.py b/build/torch210-cxx11-cu126-aarch64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP", "CPUMegaBlocksMoeMLP"]
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/grouped_gemm/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/grouped_gemm/backend.py b/build/torch210-cxx11-cu126-aarch64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
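+
+# Illustrative usage (a sketch; following the upstream grouped_gemm convention
+# that batch_sizes is a 1-D int64 tensor on the CPU): 'a' stacks the rows of all
+# groups, 'b' holds one weight matrix per group, and group i multiplies rows
+# sum(batch_sizes[:i]) .. sum(batch_sizes[:i+1]) of 'a' by b[i]:
+#
+#   a = torch.randn(10, 64, device="cuda", dtype=torch.bfloat16)
+#   b = torch.randn(2, 64, 32, device="cuda", dtype=torch.bfloat16)
+#   sizes = torch.tensor([4, 6])   # 4 rows for group 0, 6 rows for group 1
+#   c = gmm(a, b, sizes)           # c has shape [10, 32]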
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/grouped_gemm/ops.py b/build/torch210-cxx11-cu126-aarch64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
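+# Autograd wrapper around the raw grouped GEMM: the forward pass runs one GEMM
+# per group, and the backward pass reuses grouped GEMMs with one operand
+# transposed to form the gradients w.r.t. 'a' and 'b'; 'batch_sizes' and
+# 'trans_b' receive no gradient.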
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/grouped_gemm_util.py b/build/torch210-cxx11-cu126-aarch64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# grouped_gemm is vendored in this repository (see the imports at the bottom of
+# this file), so it is always available and no import probing is needed.
+_grouped_gemm_is_available: bool = True
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+    msg = (
+        'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+    )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+# Re-export the vendored grouped_gemm submodules under their conventional names.
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/layers.py b/build/torch210-cxx11-cu126-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
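+# Net effect of the patching above: when torch.compile traces a module that
+# calls these ops, the wrappers skip the CUDA kernels and return empty tensors
+# with plausible shape/dtype/device so shape propagation succeeds; outside of
+# compilation the original kernels run unchanged.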
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Compute how many ways the experts are sharded across the expert-parallel world size
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
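+# Worked example (hypothetical sizes): with world_size=8, moe_num_experts=4 and
+# ffn_hidden_size=3072, expert_sharding_degree=4, hidden_sharding_degree=2,
+# experts_per_rank=1 and features_per_rank=1536, i.e. each expert is split
+# across two ranks along the FFN dimension.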
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+    router_bias: Optional[torch.Tensor],
+    moe_top_k: int,
+    moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
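+# Shape sketch for route_tokens (illustrative): for x of shape [sl, bs, hs] and
+# E experts, logits is [sl * bs, E], while expert_weights and expert_indices are
+# both [sl * bs, moe_top_k]; the top-k logits are softmaxed (and optionally
+# p-normalized) to form the combine weights.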
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
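+
+# Note on the activation above: w1 stores gate and up columns interleaved along
+# its last dimension (even columns gate, odd columns up), and the clamped
+# swiglu-style nonlinearity is glu = gate * sigmoid(alpha * gate) with output
+# ((up + 1) * glu) @ w2 + w2_bias.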
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
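+# Worked example: with moe_top_k = 4 and shared_expert_weighted_sum = True the
+# shared expert contributes 1/5 of the output and the routed experts 4/5;
+# otherwise the two outputs are simply added.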
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, score_num_experts = expert_scores.size()
+    assert score_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counted_num_experts,) = tokens_per_expert.size()
+    assert counted_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
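+# Sanity check (illustrative): if routing were perfectly uniform, each expert
+# would receive tokens * top_k / num_experts tokens with a mean score of
+# 1 / num_experts, so the dot product is tokens * top_k / num_experts and the
+# loss evaluates to exactly 1.0.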
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
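+# Meaning of the outputs above: 'indices' is the permutation that groups token
+# slots by expert, 'bin_ids' holds the expert id of each sorted slot, 'bins' is
+# the inclusive cumulative sum of tokens per expert (the end offset of each
+# expert's bin), and 'tokens_per_expert' is the histogram itself.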
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
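+# Worked example (hypothetical sizes): 1024 tokens, top_k=4, 128 experts, a
+# single rank and capacity_factor=1.0 give 4 * 1024 / 128 = 32 slots per expert;
+# a computed capacity of 0 is later replaced by the actual maximum number of
+# tokens routed to any single expert.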
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+        # Dispatch asynchronously so the local permutation below can overlap
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
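+
+    # Illustrative usage (hypothetical tensors): after the router and experts
+    # are attached, shared-expert weights can be provided via
+    #   mlp.set_shared_expert_weights(up_w, down_w, weighted_sum=True)
+    # where up_w has shape [shared_ffn, hidden] and down_w has shape
+    # [hidden, shared_ffn], the layout expected by F.linear in
+    # shared_mlp_forward.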
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Platform overrides: on XPU, replace MegaBlocksMoeMLP with the fused XPU
+# implementation; the CPU C++ implementation is always exposed.
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/megablocks/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/metadata.json b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..155112c59509d3b4d07f4d090cbf57071e3f5217
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/all_to_all_benchmark.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2B elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/binned_gather.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/binned_scatter.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/cumsum.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/gather.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/histogram.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/histogram_benchmark.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/matmul_benchmark.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
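+
+# Example (comment only): for a contiguous (m, n) tensor, transpose_view returns
+# an (n, m) view over the same storage, equivalent to x.t() but without the
+# transpose/as_strided call chain described above.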
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/padded_gather.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
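+
+# Note (descriptive comment): unlike gather, padded_gather also takes padded_bins,
+# so each expert's tokens land at block-aligned offsets (padded_bins is built with
+# round_up + inclusive_cumsum elsewhere in this package).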
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/padded_scatter.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
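+
+# Note (descriptive comment): x is only saved for backward when a gradient for
+# `weights` is needed (needs_input_grad[3]), keeping memory low in the common
+# unweighted case; dgrad reuses padded_gather and wgrad uses padded_scatter_wgrad.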
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/padded_scatter_benchmark.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/permute_benchmark.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/repeat.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
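+
+# Example (comment only): repeat(x, torch.Size((1, 1))) returns x unchanged,
+# avoiding the copy that x.repeat(1, 1) would allocate.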
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/replicate.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
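+
+# Note (descriptive comment): replicate appears to expand one value per bin
+# (e.g. a per-expert weight) into num_outputs per-position values, with the
+# backward pass reducing gradients back to one value per bin.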
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/round_up.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
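+
+# Example (comment only): round_up(tensor([3, 130], dtype=torch.int32), 128)
+# -> tensor([128, 256]), e.g. padding per-expert token counts up to a block size.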
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/scatter.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
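+
+# Note (descriptive comment): scatter is the inverse of the gather op above;
+# when weights are provided, the top_k copies of each token are combined with
+# those weights, and backward uses scatter_wgrad for the weight gradients.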
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/sort.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
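+
+# Usage sketch (comment only): for expert ids in [0, num_experts), passing
+# end_bit=ceil(log2(num_experts)) limits how many key bits the sort considers;
+# the op returns (sorted_values, permutation_indices).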
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/sort_benchmark.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/stk_autocast.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/sum.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
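+
+# Note (descriptive comment): squeezing a singleton dimension yields the same
+# values as summing over it while skipping the reduction entirely.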
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/ops/topology.py b/build/torch210-cxx11-cu126-aarch64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
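+
+# Note (descriptive comment): per its use in matmul_benchmark.py, the returned
+# int16 tensor holds the column index of every nonzero block in the block-sparse
+# expert topology, one entry per output block.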
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/backend/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/backend/autocast.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/backend/sputnik.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
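+# Naming note (descriptive comment): following stk's (output, lhs, rhs)
+# convention, SDD produces a sparse output from two dense operands, DSD a
+# dense output from sparse @ dense, and DDS a dense output from dense @ sparse.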
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/backend/triton_kernels.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ #Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
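+
+# Example (comment only): _row_indices_kernel expands BCSR row offsets into one
+# row id per nonzero block, e.g. offsets [0, 2, 3] -> row_indices [0, 0, 1].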
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/matrix.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+    block_rows = np.prod(shape[:-1]) // block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
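+
+# Illustrative example (hypothetical 2x2 block grid): for nonzero blocks at
+# block coordinates (0, 1) and (1, 0), sorting by column gives
+# column_indices_t = [1, 0], offsets_t = [0, 1, 2] and block_offsets_t = [1, 0],
+# i.e. the transpose visits the same data blocks in column-major order.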
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+                    f"Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
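+
+# Illustrative usage (hypothetical; assumes both operands were built from the
+# same mask so they share a topology):
+#
+#   mask = stk.random.dense_mask(256, 256, sparsity=0.5, blocking=128)
+#   a = stk.ops.to_sparse((torch.randn(256, 256) * mask).half().cuda(), 128)
+#   b = stk.ops.to_sparse((torch.randn(256, 256) * mask).half().cuda(), 128)
+#   c = stk.ops.mul(a, b)  # reuses a's metadata; c.data == a.data * b.data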
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops_test.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/linear_ops.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
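+
+# Calling-pattern sketch (illustrative, not executed here): `sdd` multiplies two
+# dense operands but only materializes the output blocks that are nonzero in
+# `topo`, so the result reuses topo's sparsity metadata:
+#
+#   topo = stk.random.mask(m, n, sparsity=0.75, blocking=128)  # output pattern
+#   out = sdd(a, b, topo)                  # a: [m, k] dense, b: [k, n] dense
+#   # to_dense(out) matches (a @ b) masked by topo's nonzero blocks.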
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/linear_ops_test.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
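+
+# Note on the tolerance policy above: with the default pct=0.25, a comparison
+# passes as long as at least 99.75% of entries are within rtol=5e-2 of the
+# reference, which tolerates occasional half-precision outliers.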
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
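+
+# Round-trip sketch (illustrative; mirrors the commented-out format-conversion
+# test in matrix_ops_test.py): converting a block-sparse dense tensor to sparse
+# and back should reproduce it exactly:
+#
+#   mask = stk.random.dense_mask(128, 256, sparsity=0.5, blocking=16)
+#   x = (torch.randn(128, 256) * mask).half()
+#   assert torch.equal(to_dense(to_sparse(x, blocking=16)), x)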
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops_test.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/random/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/random/random_ops.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
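+
+# Illustrative expectation (matches the commented-out RandomOpsTest cases): for
+# rows=128, cols=256, sparsity=0.5, blocking=16 there are (128//16)*(256//16)
+# = 128 blocks, round(128 * 0.5) = 64 of them are kept, giving 64 * 16 * 16
+# nonzero entries, each equal to 1.0.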
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/stk/random/random_ops_test.py b/build/torch210-cxx11-cu126-aarch64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from .. import random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/xpu_fused_moe.py b/build/torch210-cxx11-cu126-aarch64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM wrapper; cutlass_grouped_gemm_xe2 below covers the Xe2 path.
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+        for x in arr:
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
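+
+# Example of the offset construction above (illustrative): for
+# expert_token_count = [3, 0, 2] the exclusive prefix sum yields
+# expert_offset = [0, 3, 3, 5], i.e. expert i owns rows
+# [expert_offset[i], expert_offset[i + 1]) of the grouped GEMM operands.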
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
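+
+# Worked example of the heuristic above: with num_tokens=1000 and
+# num_experts_per_node=8, candidates 32 and 64 fail the check
+# (ceilDiv(1000, 32) * 8 = 256 > 32; ceilDiv(1000, 64) * 8 = 128 > 64),
+# while 128 passes (ceilDiv(1000, 128) * 8 = 64 <= 128), so 128 is returned.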
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
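+
+# Illustrative behaviour (hypothetical values): a 16-byte uint8 slice of the
+# workspace reinterpreted with dtype=torch.int64 becomes a 2-element int64
+# tensor carrying the same bytes; callers are expected to pass slices whose
+# length is an exact multiple of the element size.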
+
+
+def implement_zp(qweight):
+ # change u4 to s4 to avoid zero point in gemm kernel
+ # only support default zero point now
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
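+
+# Illustrative mapping for the u4 -> s4 shift above (the repacking that follows
+# is kernel-specific): unsigned nibble 0x0 maps to -8, 0x8 maps to 0 and 0xF
+# maps to +7, which removes the implicit zero point of 8 from the grouped GEMM.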
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+    num_experts: int
+    ep_rank: int, expert-parallel rank of this device (default 0)
+    ep_size: int, expert-parallel world size (default 1)
+    is_fp8: bool
+    is_int4: bool
+    is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ function. Temporarily exposed here until the GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
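+
+# Example: with moe_jitter_eps=0.01 every activation is scaled by an independent
+# uniform factor drawn from [0.99, 1.01); route_tokens_xpu only applies this
+# during training.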
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
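+
+# Shape sketch (illustrative): for x flattened to [tokens, hidden] and
+# moe_top_k=k, logits is [tokens, num_experts] while expert_weights and
+# expert_indices are both [tokens, k]; the softmax is taken over the k selected
+# logits only.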
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code).
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
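+
+# Illustrative semantics (assuming the underlying ops follow the usual cumsum
+# convention): for x = [1, 2, 3],
+#   cumsum(x)                  -> [1, 3, 6]   (inclusive)
+#   cumsum(x, exclusive=True)  -> [0, 1, 3]   (exclusive)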
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+    # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/activation_fn.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
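+
+
+# Usage sketch (illustrative; `topo` stands for any block-sparse stk.Matrix,
+# e.g. the one built by the dMoE topology step):
+#
+#     y = act_fn(topo, torch.nn.functional.gelu, approximate='tanh')
+#     y, grad_fn = act_fn(topo, torch.nn.functional.gelu, return_grad_fn=True)
+#
+# Only the nonzero blocks (`x.data`) are transformed; the sparsity metadata is
+# carried over unchanged to the returned Matrix.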
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/all_to_all.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
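+
+
+# Usage sketch (illustrative; assumes torch.distributed is initialized and that
+# `send_counts`/`recv_counts` were exchanged beforehand, e.g. via an all_to_all
+# on the per-rank token counts):
+#
+#     out, handle = all_to_all(
+#         x,
+#         output_split_sizes=recv_counts,
+#         input_split_sizes=send_counts,
+#         group=group,
+#         async_op=True,
+#     )
+#     handle.wait()  # block before reading `out`
+#
+# The backward pass reruns all_to_all with the split sizes swapped, routing
+# gradients back to the ranks that produced each row.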
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/arguments.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in shared expert (purpose: to allow using custom FC layer eg te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+                import triton
+                from packaging import version
+                # Compare versions numerically; a string comparison misorders '3.10.0'.
+                if version.parse(triton.__version__) >= version.parse('3.2.0'):
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
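+# Illustrative configuration (example values only; requires a CUDA device since
+# `device` defaults to torch.cuda.current_device(), and mlp_impl='grouped'
+# additionally requires grouped GEMM support):
+#
+#     args = Arguments(
+#         hidden_size=1024,
+#         ffn_hidden_size=4096,
+#         moe_num_experts=8,
+#         moe_top_k=2,
+#         mlp_impl='grouped',   # 'sparse' requires triton < 3.2.0
+#         fp16=False,
+#         bf16=True,
+#     )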
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/common.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
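+
+
+# Sketch of cast_if_autocast_enabled (illustrative; `w` is a hypothetical CUDA
+# tensor): inside an autocast region it casts to the active autocast dtype,
+# otherwise it returns the tensor unchanged.
+#
+#     with torch.autocast('cuda', dtype=torch.bfloat16):
+#         w16 = cast_if_autocast_enabled(w)   # -> bfloat16 copy of w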
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/dmlp_registry.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
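+
+
+# For example (illustrative; assumes a CUDA device and, for 'grouped', that
+# grouped GEMM support is installed), the following resolves to a GroupedGLU:
+#
+#     expert_mlp = get(Arguments(mlp_type='glu', mlp_impl='grouped'))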
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/dmoe.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
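+        # Example (illustrative): 256 padded tokens with blocking=128 and
+        # ffn_hidden_size=512 give block_rows=2 and blocks_per_row=4, so
+        # offsets = [0, 4, 8].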
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
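+        # Worked example (illustrative): with blocking=128 and two experts
+        # receiving 3 and 130 tokens, tokens_per_expert = [3, 130],
+        # padded_tokens_per_expert = [128, 256], padded_bins = [128, 384] and
+        # bins = [3, 133]; expert i's padded rows start at padded_bins[i - 1]
+        # (taken as 0 for i = 0).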
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/gelu.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
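+# The constants above come from the tanh approximation
+#   gelu(x) ~ 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
+# with sqrt(2/pi) ~ 0.79788456 and 3 * 0.044715 * sqrt(2/pi) ~ 0.1070322243,
+# so `ff` is d(gelu)/dx evaluated with the cached tanh value.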
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/glu.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
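+        # In dense notation the return below computes
+        #     (activation_fn(x @ w1.T) * (x @ v1.T)) @ w2,
+        # except that both sdd products only materialize the blocks selected by
+        # `topo`, so the cost scales with the tokens routed to this rank.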
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/memory_test.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MiB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/mlp.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
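+# Sharding example for create_moe_expert_weights (illustrative numbers): with
+# 8 experts, expert_sharding_degree=4 and hidden_sharding_degree=2, rank 5 has
+# expert_rank = 5 % 4 = 1 and row_rank = 5 // 4 = 1, so it keeps experts
+# [2, 4) and rows [ffn_hidden_size // 2, ffn_hidden_size) of each of them.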
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+            # weighted by number of experts used
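+            # Example (illustrative): with moe_top_k = 3, t_experts = 4 and the
+            # result is shared_expert_out / 4 + expert_out * 3 / 4.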
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/moe.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
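+        # Example (illustrative): top_k=2, 4096 tokens on this rank, a world
+        # size of 1 and 64 experts give tokens_per_expert = 128; with
+        # moe_capacity_factor=1 each expert then keeps at most 128 tokens.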
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
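+        # Under perfectly uniform routing tokens_per_expert equals
+        # tokens * top_k / num_experts for every expert and
+        # expert_scores.mean(0) equals 1 / num_experts, so the scaled dot
+        # product below evaluates to exactly 1.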
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to set up for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
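+
+
+# Minimal usage sketch (illustrative only; the Arguments values below are
+# assumptions, not defaults from this module):
+#
+#   args = Arguments(hidden_size=1024, ffn_hidden_size=4096,
+#                    moe_num_experts=8, moe_top_k=2)
+#   moe = MoE(args)
+#   out = moe(torch.randn(seq_len, batch_size, args.hidden_size))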
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/mpu.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
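+
+
+# Worked example (illustrative): with an expert-parallel world size of 8 and
+# moe_num_experts=4, expert_sharding_degree=4, hidden_sharding_degree=2,
+# experts_per_rank=1 and features_per_rank=ffn_hidden_size // 2 (assuming
+# ffn_hidden_size is even).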
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/router.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
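+
+
+# For reference, each entry of the returned tensor is
+# moe_zloss_weight * mean_i(logsumexp_j(logits[i, j]) ** 2), computed over the
+# logits captured by _save_router_logits during the forward pass.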
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
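+ # e.g. with moe_jitter_eps = 0.1 the multiplicative factor below is uniform
+ # over [0.9, 1.1).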
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
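+
+
+# Shape sketch (illustrative): for input x of shape [sl, bs, hidden_size],
+# scores is [sl * bs, moe_num_experts] and expert_weights / expert_indices are
+# both [sl * bs, moe_top_k].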
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch210-cxx11-cu126-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch210-cxx11-cu126-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..4e4162acffa15b572b47e28662e3ea8dc8259eee
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7949dd996d24b131bc32dd98e15b6bf00c5f4cad2f17cd96edec5f5ae90544de
+size 15061056
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_ops.py b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_version.py b/build/torch210-cxx11-cu126-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/backend/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/backend/kernels.py b/build/torch210-cxx11-cu126-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have
+# CUDA. This approach preserves the original code while enabling testing
+# without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has a greater or equal
+ # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
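+
+
+# Example (illustrative): with top_k=1, three experts and per-expert token
+# counts [2, 1, 3], bins = [2, 3, 6]. If each expert's bin is padded up to a
+# multiple of 4, padded_bins = [4, 8, 12] and padded_gather emits 12 output
+# rows, with unused rows left as zeros.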
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the number of output rows equals the
+ # number of input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has a greater or equal
+ # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/benchmark_util.py b/build/torch210-cxx11-cu126-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
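+
+
+# Usage sketch (illustrative; assumes CUDA tensors a and b already exist):
+#   mean_ms, std_ms = benchmark_function(lambda: torch.mm(a, b))
+#   log_benchmark('matmul', {'shape': tuple(a.shape)}, mean_ms, std_ms)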
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/cpu_fused_moe.py b/build/torch210-cxx11-cu126-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
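+
+
+# Quick numeric check (illustrative): swigluoai_activation(torch.zeros(1),
+# torch.zeros(1)) gives (0 + 1) * (0 * sigmoid(0)) = 0, and large positive
+# gate/up values are clamped at limit=7.0 before the product.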
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+ This implementation processes all experts in parallel using batched operations
+ instead of sequential for loops, which is more efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+ # Build expert mask: which tokens go to which expert
+ # expert_mask[expert_id] contains indices of (token_idx, topk_pos) pairs
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
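+
+
+# Usage sketch (illustrative; the shapes below are assumptions chosen for the
+# example, with 2 experts, hidden_size=16 and inter_size=4):
+#   x = torch.randn(4, 16)
+#   w1 = torch.randn(2, 16, 8)   # [num_experts, hidden, 2 * inter]
+#   w2 = torch.randn(2, 4, 16)   # [num_experts, inter, hidden]
+#   _, weights, ids = route_tokens_cpu(x, torch.randn(2, 16), None, 1, 2)
+#   y = cpu_fused_moe(x, w1, w2, weights, ids)   # -> [4, 16]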
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py b/build/torch210-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
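+
+
+# NOTE (illustrative): usage mirrors cpu_fused_moe in cpu_fused_moe.py; the
+# extra arguments select quantization (int8 / fp8 / mxfp4) and, when alpha and
+# limit are given, the swigluoai activation instead of silu_and_mul.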
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP"]
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/grouped_gemm/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/grouped_gemm/backend.py b/build/torch210-cxx11-cu126-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
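+
+# Shape sketch (illustrative): for a=[tokens, k], b=[num_experts, k, n] and
+# batch_sizes summing to tokens, gmm returns [tokens, n]. With trans_a=True,
+# a and b are both 2-d and the result has shape
+# [batch_sizes.shape[0], a.shape[1], b.shape[1]].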
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/grouped_gemm/ops.py b/build/torch210-cxx11-cu126-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/grouped_gemm_util.py b/build/torch210-cxx11-cu126-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
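+ # grouped_gemm is vendored alongside this module, so the optional import is
+ # left commented out and availability is effectively hard-coded to True.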
+ # import grouped_gemm
+ pass
+ _grouped_gemm_is_available = True
+except ImportError as error:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+ '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/layers.py b/build/torch210-cxx11-cu126-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
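The patching above swaps each op for a wrapper that, while `torch.compile` is tracing, skips the CUDA kernel and returns empty tensors with the right shape, dtype, and device. A standalone sketch of the same gating pattern (illustrative only; `torch.compiler.is_compiling()` is assumed to exist in the installed PyTorch, and `torch.bincount` stands in for the real histogram kernel):

```python
import torch


def _histogram_eager(x: torch.Tensor, max_val: int) -> torch.Tensor:
    # Stand-in for the compiled CUDA histogram op.
    return torch.bincount(x, minlength=max_val).to(torch.int32)


def histogram(x: torch.Tensor, max_val: int) -> torch.Tensor:
    if torch.compiler.is_compiling():
        # During graph capture only metadata matters, so return an empty
        # tensor with the correct shape/dtype/device.
        return torch.empty((max_val,), dtype=torch.int32, device=x.device)
    return _histogram_eager(x, max_val)


x = torch.tensor([0, 1, 1, 3])
print(histogram(x, max_val=4))  # tensor([1, 2, 0, 1], dtype=torch.int32)
```
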
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Calculate the expert sharding degree based on world size and number of experts
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
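The helpers above split the world size between expert sharding and hidden sharding. A small worked example with assumed sizes (16 ranks, 8 experts, ffn_hidden_size 3072):

```python
# Illustrative arithmetic mirroring the helpers above (assumed sizes).
world_size, moe_num_experts, ffn_hidden_size = 16, 8, 3072

esd = min(world_size, moe_num_experts)   # expert_sharding_degree -> 8
hsd = world_size // esd                  # hidden_sharding_degree -> 2
assert moe_num_experts % esd == 0 and ffn_hidden_size % hsd == 0
assert esd * hsd == world_size

print(moe_num_experts // esd)    # experts_per_rank  -> 1
print(ffn_hidden_size // hsd)    # features_per_rank -> 1536
```
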
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
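`route_tokens` flattens the tokens, scores them with the linear router, keeps the top-k experts, and softmaxes the selected scores. A shape-level sketch with plain torch and assumed sizes (jitter, weight normalization, and uniform assignment omitted):

```python
import torch

bs, sl, hidden, num_experts, top_k = 2, 3, 16, 8, 2
x = torch.randn(bs, sl, hidden)
router_weight = torch.randn(num_experts, hidden)
router_bias = torch.zeros(num_experts)

x_flat = x.view(-1, hidden)                       # [bs * sl, hidden]
logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
expert_weights, expert_indices = torch.topk(logits, top_k, dim=-1)
expert_weights = expert_weights.softmax(dim=-1)

print(logits.shape, expert_weights.shape, expert_indices.shape)
# torch.Size([6, 8]) torch.Size([6, 2]) torch.Size([6, 2])
```
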
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
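`mlp_forward` expects batched per-expert weights with the gate and up projections interleaved along the last dimension of `w1`, and applies a clamped GLU (`gate * sigmoid(alpha * gate)`). A shape check with assumed sizes:

```python
import torch

experts, capacity, hs, isz = 4, 8, 16, 32      # assumed sizes
x = torch.randn(experts, capacity, hs)
w1 = torch.randn(experts, hs, 2 * isz) * 0.02  # gate/up interleaved
w1_bias = torch.zeros(experts, 2 * isz)
w2 = torch.randn(experts, isz, hs) * 0.02
w2_bias = torch.zeros(experts, hs)
alpha, limit = 1.702, 7.0

gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
gate, up = gate_up[..., ::2], gate_up[..., 1::2]   # de-interleave
gate = gate.clamp(max=limit)
up = up.clamp(min=-limit, max=limit)
glu = gate * torch.sigmoid(gate * alpha)
out = torch.bmm((up + 1) * glu, w2) + w2_bias[..., None, :]
print(out.shape)  # torch.Size([4, 8, 16])
```
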
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
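With `shared_expert_weighted_sum` enabled, the shared expert is treated as one extra expert next to the `moe_top_k` routed ones, so the blend is a fixed convex combination. Quick check with assumed values:

```python
import torch

moe_top_k = 4
shared_expert_out = torch.ones(2, 3)   # stand-in activations
expert_out = torch.zeros(2, 3)

total_experts = moe_top_k + 1
combined = (shared_expert_out * (1.0 / total_experts)
            + expert_out * (moe_top_k / total_experts))
print(combined)  # all entries are 0.2: the shared expert contributes 1/5
```
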
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+    expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
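The load-balancing loss is a scaled dot product between per-expert token counts and the mean router scores; the `num_experts / (tokens * top_k)` scale makes a perfectly balanced assignment with uniform scores evaluate to 1.0. A tiny check with assumed numbers:

```python
import torch

tokens, top_k, num_experts = 8, 1, 4
# Perfectly balanced: each expert receives tokens / num_experts tokens
# and a uniform mean score of 1 / num_experts.
tokens_per_expert = torch.full((num_experts,), tokens / num_experts)
expert_scores = torch.full((tokens, num_experts), 1.0 / num_experts)

scale = num_experts / (tokens * top_k)
loss = scale * torch.dot(tokens_per_expert, expert_scores.mean(dim=0))
print(loss)  # tensor(1.) in the balanced case
```
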
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
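`indices_and_bins` sorts the token-to-expert assignments, counts tokens per expert, and turns the counts into inclusive-cumsum bin boundaries. Pure-torch equivalents (argsort / bincount / cumsum) illustrate the intent on a small assumed input:

```python
import torch

num_experts = 4
top_expert = torch.tensor([2, 0, 2, 1, 0, 2])

# ops.sort -> (sorted expert ids, permutation indices)
indices = torch.argsort(top_expert, stable=True)
bin_ids = top_expert[indices]
# ops.histogram -> tokens per expert; ops.inclusive_cumsum -> bin boundaries
tokens_per_expert = torch.bincount(top_expert, minlength=num_experts)
bins = torch.cumsum(tokens_per_expert, dim=0)

print(bin_ids.tolist())            # [0, 0, 1, 2, 2, 2]
print(tokens_per_expert.tolist())  # [2, 1, 3, 0]
print(bins.tolist())               # [2, 3, 6, 6]
```
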
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
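`expert_capacity_fn` sizes each expert's buffer as `capacity_factor * top_k * tokens * world_size / num_experts`. Quick arithmetic with assumed numbers:

```python
# Assumed: 4096 tokens on this rank, top_k=4, 128 experts,
# no expert parallelism (world_size=1), capacity_factor=1.0.
tokens, top_k, num_experts = 4096, 4, 128
world_size, capacity_factor = 1, 1.0

tokens_per_expert = top_k * tokens * world_size / num_experts   # 128.0
expert_capacity = int(capacity_factor * tokens_per_expert)      # 128
print(expert_capacity)
```
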
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+        # Kick off the asynchronous token-count exchange across ranks
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
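`create_shared_expert_weights` only allocates the two projection matrices and runs the supplied initializers on them; the shared expert itself is just an up projection, activation, and down projection (see `shared_mlp_forward` above). A standalone sketch with assumed sizes and `torch.nn.init` initializers:

```python
import torch

hidden_size, shared_hidden = 64, 256    # assumed sizes

# Mirrors create_shared_expert_weights with kaiming-uniform initializers.
up_proj_weight = torch.empty(shared_hidden, hidden_size)
down_proj_weight = torch.empty(hidden_size, shared_hidden)
torch.nn.init.kaiming_uniform_(up_proj_weight)
torch.nn.init.kaiming_uniform_(down_proj_weight)

# Shared-expert MLP: up projection, activation, down projection.
x = torch.randn(10, hidden_size)
y = torch.nn.functional.linear(x, up_proj_weight)
y = torch.nn.functional.gelu(y)
y = torch.nn.functional.linear(y, down_proj_weight)
print(y.shape)  # torch.Size([10, 64])
```
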
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/megablocks/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/metadata.json b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..155112c59509d3b4d07f4d090cbf57071e3f5217
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per fp16 element.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/binned_gather.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/binned_scatter.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/cumsum.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
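
Both wrappers dispatch to the compiled `megablocks_ops` extension and do not track gradients. The expected semantics, illustrated with plain torch on a small assumed input:

```python
import torch

x = torch.tensor([3, 1, 4, 2])

inclusive = torch.cumsum(x, dim=0)   # [3, 4, 8, 10]
exclusive = inclusive - x            # [0, 3, 4, 8]
print(inclusive.tolist(), exclusive.tolist())
```
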
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/gather.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/histogram.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+#             '0::Fwd::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+#             '0::GradX::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+#             '0::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/padded_gather.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
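+
+# Minimal usage sketch (routing metadata built with ops.sort / ops.histogram /
+# ops.round_up / ops.inclusive_cumsum, as in the benchmarks in this repo):
+#   out = padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+# This permutes the rows of `x` into per-expert bins padded to the block size.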
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/padded_scatter.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
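+
+
+# Note: padded_scatter un-permutes rows produced by padded_gather back to
+# token order, reducing the top_k copies of each token and applying `weights`
+# when provided; its backward computes the data gradient with
+# kernels.padded_gather.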
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/repeat.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
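+
+
+# Example: repeat(x, torch.Size((1, 2))) tiles a 2D `x` twice along its second
+# dimension, while an all-ones tiling returns `x` as-is to avoid a copy.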
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/replicate.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrapped in a try-block to give a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/round_up.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
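+
+
+# Worked example: round_up(torch.tensor([3, 130], dtype=torch.int32), 128)
+# returns tensor([128, 256], dtype=torch.int32); each entry is rounded up to
+# the next multiple of `value`.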
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/scatter.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
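+
+
+# scatter is the un-padded counterpart of padded_scatter: rows are returned to
+# token order using `bins` alone, and a gradient for `weights` is computed via
+# kernels.scatter_wgrad when it is required.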
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/sort.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrapped in a try-block to give a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
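+
+# Usage sketch: `bin_ids, indices = sort(top_expert)` returns the sorted keys
+# and the permutation that sorts them. `end_bit` may be lowered when the
+# maximum key is known (see sort_benchmark.py) so fewer bits are sorted over.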
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/stk_autocast.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/sum.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
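+
+
+# When the reduced dimension has length 1 the result equals x.squeeze(dim),
+# so the reduction kernel is skipped; otherwise this is a plain torch sum.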
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/ops/topology.py b/build/torch210-cxx11-cu126-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrapped in a try-block to give a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
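+
+# The int16 output holds block column indices for the block-sparse (BCSR)
+# topology derived from the padded expert bins; note that stk.Matrix stores
+# its column_indices with the same int16 dtype.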
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/backend/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/backend/autocast.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/backend/sputnik.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
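+# A 2D tensor counts as "transposed" here when it is stored column-major:
+# stride 1 along dim 0 and a dim-1 stride equal to the number of rows.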
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/backend/triton_kernels.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
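+    # In the MoE path the sparse row counts are already padded to multiples of
+    # 128 (ops.round_up), so these checks mainly catch model dimensions that
+    # are not multiples of the Triton block sizes configured above.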
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ #Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
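+# Expands the CSR-style `offsets` into one row id per nonzero block: every
+# block owned by block-row `pid` receives the value `pid`.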
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/matrix.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/linear_ops.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
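A hedged sketch of the three matmul flavours wired above (dense = sparse @ dense, dense = dense @ sparse, sparse = dense @ dense). The sputnik kernels they dispatch to are CUDA kernels, so a CUDA device and fp16/bf16 operands are assumed, following the m/k/n convention of the tests below.

```python
import torch
import stk  # assumption: this build's stk package is importable

m, k, n, blocking = 512, 512, 512, 128
cuda = torch.device("cuda")

# Block-sparse LHS for dsd, built on CPU then moved to the GPU.
a_mask = stk.random.dense_mask(m, k, 0.5, blocking)
a = stk.ops.to_sparse((torch.randn(m, k) * a_mask).half(), blocking).to(cuda)
b = torch.randn(k, n, dtype=torch.float16, device=cuda)

out = stk.ops.dsd(a, b)  # dense [m, n] result of (sparse a) @ (dense b)

# sdd needs an output topology; only its sparsity pattern is consumed.
topo = stk.random.mask(m, n, 0.5, blocking).to(cuda)
c = torch.randn(m, k, dtype=torch.float16, device=cuda)
out_sparse = stk.ops.sdd(c, b, topo)  # stk.Matrix with topo's pattern
```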
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler than its current implementation.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
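A hedged round-trip sketch for `to_sparse`/`to_dense`. Both helpers are plain PyTorch here, so under the same assumptions as the commented-out `MatrixOpsTest` below they should run on CPU; the block size must divide both dimensions.

```python
import torch
import stk  # assumption: this build's stk package is importable

x = torch.randn(16, 32, dtype=torch.float16)
x[:8, :16] = 0.0  # zero two full 8x8 blocks so they are dropped

sparse = stk.ops.to_sparse(x, blocking=8)
dense = stk.ops.to_dense(sparse)

print(sparse.nnz)                  # stored elements: 6 blocks * 8 * 8 = 384
print(torch.equal(x, dense))       # expected: True (lossless round trip)
print(stk.ops.sum(sparse).item())  # sum over stored values only
```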
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/random/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/random/random_ops.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
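A hedged sketch of the three random helpers above. `dense_mask` is pure numpy/torch; `mask` and `randn` additionally go through `to_sparse`, so they return `stk.Matrix` objects.

```python
import stk  # assumption: this build's stk package is importable

# 0/1 float32 mask: 8x16 = 128 blocks, round(128 * 0.25) = 32 blocks kept.
m = stk.random.dense_mask(128, 256, sparsity=0.75, blocking=16)
print(m.shape, int(m.sum()))  # torch.Size([128, 256]), 32 * 16 * 16 = 8192

# Block-sparse Matrix filled with normal samples over the same kind of mask.
x = stk.random.randn((128, 256), sparsity=0.75, blocking=16)
print(x.nnz)                  # 8192 stored values
```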
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py b/build/torch210-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from .. import random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/xpu_fused_moe.py b/build/torch210-cxx11-cu126-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# default
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
+
+
+def implement_zp(qweight):
+ # Convert u4 to s4 to avoid handling the zero point in the GEMM kernel.
+ # Only the default zero point is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+ # TODO: this will all be integrated into the C++ function; temporarily exposed here before GEMM fusion.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
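As an aside, the routing helper above is plain PyTorch and can be exercised in isolation; only the grouped GEMMs and gather/scatter paths need the XPU custom ops. A minimal, hedged sketch, assuming `route_tokens_xpu` has been imported from this module:

```python
import torch
# assumption: route_tokens_xpu has been imported from this module

hidden, num_experts, top_k = 64, 8, 2
x = torch.randn(4, 16, hidden)                 # [batch, seq, hidden]
router_weight = torch.randn(num_experts, hidden) * 0.02

logits, weights, ids = route_tokens_xpu(
    x,
    router_weight,
    router_bias=None,
    moe_top_k=top_k,
    moe_num_experts=num_experts,
    moe_normalize_expert_weights=1,            # L1-normalize the top-k weights
)
print(logits.shape, weights.shape, ids.shape)  # (64, 8), (64, 2), (64, 2)
```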
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
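A hedged sketch of the convenience wrappers exported above, mirroring how the dMoE layer later in this diff uses them (token-to-expert counts, offsets, and a radix sort). The underlying ops are compiled CUDA kernels, so int32 tensors on a CUDA device are assumed.

```python
import torch
import megablocks  # assumption: this build is importable under this name

expert_ids = torch.randint(0, 8, (1024,), dtype=torch.int32, device="cuda")

counts = megablocks.histogram(expert_ids, num_bins=8)          # tokens per expert
starts = megablocks.cumsum(counts, dim=0, exclusive=True)      # first row per expert
sorted_ids, order = megablocks.argsort(expert_ids, end_bit=3)  # 8 experts -> 3 bits
```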
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/activation_fn.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/all_to_all.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/arguments.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in the shared expert (allows using a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+ shared_expert_weighted_sum: bool = False # enable using weighted sum for shared expert output (weighted by the number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
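A hedged construction sketch for `Arguments`. Passing `device` explicitly avoids the `torch.cuda.current_device()` default factory, and `mlp_impl='grouped'` sidesteps the sparse-path triton version check (it does, however, require the grouped_gemm backend to be available).

```python
import torch
# assumption: Arguments is imported from this module

args = Arguments(
    hidden_size=1024,
    ffn_hidden_size=4096,
    moe_num_experts=8,
    moe_top_k=2,
    mlp_type="mlp",
    mlp_impl="grouped",      # __post_init__ checks grouped_gemm availability
    fp16=False,
    bf16=True,
    device=torch.device("cuda"),
)
print(args.shared_expert_hidden_size)  # defaults to ffn_hidden_size (4096)
```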
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/common.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
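A small hedged example of `cast_if_autocast_enabled`: inside an autocast region it re-casts a tensor to the active autocast dtype, and outside it is a no-op (assumes a CUDA device and that the function is imported from this module).

```python
import torch
# assumption: cast_if_autocast_enabled is imported from this module

x = torch.randn(4, 4, device="cuda")  # float32

with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    y = cast_if_autocast_enabled(x)
    print(y.dtype)  # torch.bfloat16 while autocast is active

print(cast_if_autocast_enabled(x).dtype)  # torch.float32 outside autocast
```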
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/dmlp_registry.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+ (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/dmoe.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
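+        # For example (illustrative): with padded_tokens=256, blocking=128 and a
+        # 256-wide expert slice, block_rows=2 and blocks_per_row=2, giving
+        # offsets = [0, 2, 4].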
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
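+        # For example (illustrative): tokens_per_expert=[3, 1, 0, 2] with
+        # blocking=128 pads to [128, 128, 0, 128], so padded_bins=[128, 256, 256, 384].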
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/gelu.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
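+# The magic constants below come from the tanh approximation of GeLU,
+# gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))),
+# with sqrt(2/pi) ~= 0.79788456 and 3 * 0.044715 * sqrt(2/pi) ~= 0.1070322243.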
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/glu.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+                'Memory optimized implementation not yet supported for GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
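+        # (out = (act(x @ w1.T) * (x @ v1.T)) @ w2, with every intermediate
+        # sharing the block-sparse topology `topo`.)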
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+    """GLU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/memory_test.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/mlp.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+    def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
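+    # Flatten the expert dimension: [experts_per_rank, rows, columns] becomes
+    # [experts_per_rank * rows, columns] to match the stacked block-sparse layout.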
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+            # weighted by the number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/moe.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f'Expected {num_layers_per_pipeline_stage} tokens_per_expert '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
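+    # For example (illustrative): with 8 experts, moe_loss_weight=0.01,
+    # num_layers=4, 2048 tokens and top_k=2,
+    # scale = (8 * 0.01) / (4 * 2048 * 2) ~= 4.9e-6.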
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
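+        # For example (illustrative): top_k=2, tokens=1024, world_size=1,
+        # num_experts=8 and moe_capacity_factor=1.25 give a capacity of
+        # int(1.25 * 2 * 1024 / 8) = 320 tokens per expert.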
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
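+        # i.e. loss = num_experts / (tokens * top_k)
+        #             * dot(tokens_per_expert, mean(expert_scores, dim=0)),
+        # the usual auxiliary load-balancing loss.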
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/mpu.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
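+
+
+# Worked example (illustrative): with world_size=4, moe_num_experts=2 and
+# ffn_hidden_size=4096, expert_sharding_degree=2, hidden_sharding_degree=2,
+# experts_per_rank=1 and features_per_rank=2048.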
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/router.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
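+
+
+# Illustrative sketch (assumed inputs, not part of the upstream module): for an
+# index tensor with 6 entries and num_experts=4, the uniform assignment is
+#   torch.remainder(torch.arange(6), 4)  ->  [0, 1, 2, 3, 0, 1]
+# i.e. tokens are striped round-robin across the experts regardless of the
+# router scores.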
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
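+
+
+# Shape sketch (illustrative, not part of the upstream module): for an input x
+# of shape [batch, seq, hidden_size], forward() flattens to [batch * seq,
+# hidden_size] and returns
+#   scores          [batch * seq, moe_num_experts]
+#   expert_weights  [batch * seq, moe_top_k]
+#   expert_indices  [batch * seq, moe_top_k]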
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layers/sharedexpert_registry.py b/build/torch210-cxx11-cu128-aarch64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
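+
+
+# Illustrative usage (assumed Arguments values, not part of the upstream
+# module): get(Arguments(mlp_type='glu')) returns a glu.SharedGLU instance,
+# while an unsupported mlp_type raises ValueError.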
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch210-cxx11-cu128-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..7d9953915c47274e401eeab2f0332618b25769ed
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3f60e88338e68c1def0050d88229517cf28b83852da6b4a3fff41e73331eca0
+size 21088232
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_ops.py b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_version.py b/build/torch210-cxx11-cu128-aarch64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/backend/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/backend/kernels.py b/build/torch210-cxx11-cu128-aarch64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub triton autotune when testing in an environment that does not have CUDA.
+# This approach preserves the original code but enables testing without a GPU.
+if torch.cuda.is_available() is False:
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
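+
+
+# Worked example (illustrative, not part of the upstream module): with two
+# experts, bins=[3, 5] and padded_bins=[4, 8], padded_gather produces an output
+# with padded_bins[-1] == 8 rows; rows 0-2 hold expert 0's three tokens, row 3
+# stays zero as padding, rows 4-5 hold expert 1's two tokens and rows 6-7 stay
+# zero.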
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+    # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
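+
+
+# Note (illustrative, based on the kernel above): the launch grid is
+# (num_experts, expert_capacity), so each expert copies at most expert_capacity
+# of its assigned tokens into its slice of the output; tokens beyond the
+# capacity are simply not gathered, and unused capacity slots remain zero
+# because the output is zero-initialized.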
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+    # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/benchmark_util.py b/build/torch210-cxx11-cu128-aarch64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
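+
+
+# Illustrative usage (assumed model/input, not part of the upstream module):
+#
+#   mean_ms, std_ms = benchmark_function(lambda: model(x), iterations=100)
+#   log_benchmark('dMoE forward', {'batch': x.shape[0]}, mean_ms, std_ms)
+#
+# benchmark_function times with CUDA events, so it requires a GPU and the
+# callable should launch CUDA work.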
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/cpu_fused_moe.py b/build/torch210-cxx11-cu128-aarch64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+ This implementation processes all experts in parallel using batched operations
+ instead of sequential for loops, which is more efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+ # Build expert mask: which tokens go to which expert
+ # expert_mask[expert_id] contains indices of (token_idx, topk_pos) pairs
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
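+
+
+# Shape sketch (illustrative, assumed sizes, not part of the upstream module):
+# with num_tokens=4, hidden_size=8, inter_size=16, num_experts=2 and topk=2:
+#   hidden_states  [4, 8]
+#   w1             [2, 8, 32]    # gate_up_proj, 2 * inter_size columns
+#   w2             [2, 16, 8]    # down_proj
+#   topk_weights   [4, 2], topk_ids [4, 2]
+# cpu_fused_moe(...) returns a [4, 8] tensor accumulated over the selected experts.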
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/cpu_moe_cpp.py b/build/torch210-cxx11-cu128-aarch64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
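+
+
+# Illustrative call (assumed tensors, not part of the upstream module); shapes
+# follow the docstring above (w1: [E, 2N, K], w2: [E, K, N]):
+#
+#   out = fused_moe_cpp(hidden_states, w1, w2, topk_weights,
+#                       topk_ids.to(torch.int32), w1_bias=b1, w2_bias=b2,
+#                       alpha=1.702, limit=7.0, is_vnni=False)
+#
+# float32 inputs are transparently run in bfloat16 and cast back on return.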
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP"]
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/grouped_gemm/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/grouped_gemm/backend.py b/build/torch210-cxx11-cu128-aarch64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
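+
+
+# Shape sketch (illustrative, not part of the upstream module), following
+# _allocate_output above:
+#   trans_a=False, trans_b=False: a [tokens, k], b [num_experts, k, n] -> c [tokens, n]
+#   trans_a=False, trans_b=True : a [tokens, k], b [num_experts, n, k] -> c [tokens, n]
+#   trans_a=True                : a [tokens, k], b [tokens, n]         -> c [num_experts, k, n]
+# where batch_sizes is a 1-D tensor of per-expert token counts.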
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/grouped_gemm/ops.py b/build/torch210-cxx11-cu128-aarch64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
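+
+
+# Illustrative usage (assumed tensors, not part of the upstream module):
+#
+#   # a: [tokens, k], b: [num_experts, k, n], batch_sizes: per-expert token counts
+#   c = gmm(a, b, batch_sizes)   # forward through the autograd wrapper
+#   c.sum().backward()           # gradients flow to a and b via GroupedGemm.backward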
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/grouped_gemm_util.py b/build/torch210-cxx11-cu128-aarch64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # import grouped_gemm
+ pass
+ _grouped_gemm_is_available = True
+except ImportError as error:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+ '`pip install git+https://github.com/tgale96/grouped_gemm@main`.',
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/layers.py b/build/torch210-cxx11-cu128-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
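+
+
+# Shape sketch (illustrative, not part of the upstream module): for x of shape
+# [batch, seq, hidden_size], route_tokens returns
+#   logits          [batch * seq, moe_num_experts]
+#   expert_weights  [batch * seq, moe_top_k]  (softmax over the selected top-k logits)
+#   expert_indices  [batch * seq, moe_top_k]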
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
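+
+
+# Shape sketch (illustrative, assumed sizes, not part of the upstream module):
+# mlp_forward expects per-expert batched inputs, e.g.
+#   x        [num_experts, capacity, hidden_size]
+#   w1       [num_experts, hidden_size, 2 * ffn_hidden_size]  # interleaved gate/up columns
+#   w1_bias  [num_experts, 2 * ffn_hidden_size]
+#   w2       [num_experts, ffn_hidden_size, hidden_size]
+#   w2_bias  [num_experts, hidden_size]
+# and returns [num_experts, capacity, hidden_size] after the clamped SwiGLU-style activation.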
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
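+        # e.g. with moe_top_k=4 the shared expert contributes 1/5 of the
+        # output and the routed experts contribute 4/5.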
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup],
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
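+# Switch-Transformer-style auxiliary loss for a single layer:
+#
+#   loss = (num_experts / (tokens * top_k)) * sum_e f_e * P_e
+#
+# where f_e is the number of tokens routed to expert e and P_e is the mean
+# router score assigned to expert e over the batch.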
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+    tokens, score_num_experts = expert_scores.size()
+    assert score_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (count_num_experts,) = tokens_per_expert.size()
+    assert count_num_experts == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
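+# Illustrative sketch of indices_and_bins (values are assumptions for the
+# example; tie-breaking within an expert depends on the sort kernel):
+# for top_expert = [2, 0, 2, 1] and num_experts = 4,
+#   bin_ids           = [0, 1, 2, 2]   expert id of each sorted token
+#   indices           = [1, 3, 0, 2]   permutation that sorts tokens by expert
+#   tokens_per_expert = [1, 1, 2, 0]
+#   bins              = [1, 2, 4, 4]   inclusive cumsum, one bin end per expert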
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
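+# Worked example (numbers are assumptions): tokens=4096, top_k=4,
+# num_experts=128, world_size=1, moe_capacity_factor=1.0 gives
+# tokens_per_expert = 4 * 4096 / 128 = 128, so each expert receives at most
+# 128 token slots before overflow.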
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Ensure CUB knows which device to use
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
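+    # With hidden sharding, each expert's ffn weights are split across
+    # hidden_sharding_deg ranks, so every token is sent to all shards of its
+    # expert and the partial results are summed afterwards (Step 7).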
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
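+# Minimal single-process sketch of moe_forward (all shapes and values are
+# assumptions for illustration, not defaults of the kernel):
+#
+#   out, expert_weights, router_scores = moe_forward(
+#       x=x,                                  # [sl, bs, hs]
+#       router_weight=router.weight,
+#       router_bias=router.bias,
+#       moe_top_k=4,
+#       moe_num_experts=128,
+#       w1=gate_up_proj, w2=down_proj,
+#       w1_bias=gate_up_proj_bias, w2_bias=down_proj_bias,
+#       forward_fn=forward_once,              # parallel_forward_once for EP
+#       hidden_size=hs,
+#   )
+#
+# out has the same shape as x; router_scores is [num_experts, sl * bs].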
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
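+        # sort_end_bit is the number of bits the radix sort needs to cover
+        # all expert ids, i.e. ceil(log2(num_experts)) with a minimum of 1.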
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
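+    # Typical wiring (illustrative sketch; names are assumptions): create the
+    # shared weights and register them before the first forward pass, e.g.
+    #
+    #   up_w, down_w, up_b, down_b = create_shared_expert_weights(
+    #       hidden_size=hs, shared_expert_hidden_size=4 * hs,
+    #       device=torch.device("cuda"), dtype=torch.bfloat16,
+    #       init_method=torch.nn.init.kaiming_uniform_,
+    #   )
+    #   mlp.set_shared_expert_weights(up_w, down_w, up_b, down_b)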
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/megablocks/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/metadata.json b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e3e4edf582b7ffb515d0ed32e9fc9c89f125c441
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/all_to_all_benchmark.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2B elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/binned_gather.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/binned_scatter.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/cumsum.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/gather.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/histogram.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/histogram_benchmark.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/matmul_benchmark.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/padded_gather.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/padded_scatter.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/padded_scatter_benchmark.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/permute_benchmark.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/repeat.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/replicate.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
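+# Note (assumed semantics): `replicate` expands one value per bin into one value
+# per output position, with `bins` holding the inclusive cumulative sum of bin
+# sizes; the backward pass sums gradients back to one value per bin.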
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/round_up.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
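+# Worked example (illustrative):
+#   round_up(torch.tensor([1, 128, 200], dtype=torch.int32), 128)
+# returns tensor([128, 128, 256]), i.e. each entry rounded up to the nearest
+# multiple of `value`.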
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/scatter.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
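+# Note (assumed semantics): `scatter` is the un-padded counterpart of
+# `padded_scatter`; `x` is expected in the expert-sorted layout produced by the
+# gather kernel, and the result is the weighted un-permutation back to the
+# original token order.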
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/sort.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
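+# Example (illustrative, mirroring the benchmark usage in this package):
+#   bin_ids, indices = sort(top_expert)
+# where `top_expert` is an int tensor of expert assignments, `bin_ids` is its
+# sorted copy and `indices` is the permutation that sorts it (used later by the
+# gather/scatter kernels).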
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/sort_benchmark.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/stk_autocast.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
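+# Descriptive note: custom_fwd/custom_bwd disable autocast inside the wrapped
+# function; custom_fwd additionally pre-casts eligible CUDA floating-point
+# arguments to the current autocast dtype so the kernels see a uniform dtype.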
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/sum.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/ops/topology.py b/build/torch210-cxx11-cu128-aarch64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
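+# Note (assumed semantics): `topology` materializes the block-sparse layout of the
+# expert computation; given the padded per-expert offsets it writes one int16
+# column index per nonzero block (output_block_rows * output_block_columns entries).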
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/backend/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/backend/autocast.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/backend/sputnik.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
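+# Naming convention (descriptive note): op names encode output-lhs-rhs layouts,
+# i.e. dsd = dense output of (sparse @ dense), dds = dense output of
+# (dense @ sparse), sdd = sparse output of (dense @ dense).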
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/backend/triton_kernels.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
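+# Worked example (illustrative): with block-row offsets [0, 2, 3, 6] the kernel
+# writes out = [0, 0, 1, 2, 2, 2], i.e. the block-row id of every nonzero block.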
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/matrix.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
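+# Illustrative example (hypothetical shapes): a 256x256 matrix with 128x128
+# blocking and nonzero blocks at block coordinates (0, 0) and (1, 1) is stored as
+#   data:           [2, 128, 128]  (one block of values per nonzero block)
+#   row_indices:    [0, 1]
+#   column_indices: [0, 1]
+#   offsets:        [0, 1, 2]      (nonzero-block prefix sums, one per block row + 1)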
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops_test.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/linear_ops.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
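+# Usage sketch (illustrative; variable names are assumptions): for dense tensors
+# `a`, `b`, a sparse stk.Matrix `s` and a topology Matrix `topo`,
+#   sdd(a, b, topo)  computes a @ b, materializing only topo's nonzero blocks,
+#   dsd(s, b)        computes s @ b and returns a dense tensor,
+#   dds(a, s)        computes a @ s and returns a dense tensor.
+# All participating dimensions must be divisible by the blocking (see the Triton
+# backend's _validate_matmul_dims).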
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/linear_ops_test.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
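+# Illustrative sketch of the expansion (assuming blocking=2): the block-level
+# index pair (row, col) = (1, 2) expands to the element-level pairs
+# (2, 4), (2, 5), (3, 4), (3, 5), i.e. every element covered by that block in
+# the dense matrix.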
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
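+
+# Illustrative round-trip (not executed on import; the sizes and blocking are
+# examples only):
+#
+#   mask = stk.random.dense_mask(256, 256, 0.5, blocking=16)
+#   x = (torch.randn(256, 256) * mask).half()
+#   sparse = to_sparse(x, blocking=16)       # block-sparse Matrix
+#   assert torch.equal(to_dense(sparse), x)  # exact for block-sparse inputs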
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops_test.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/random/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/random/random_ops.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
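+
+# Illustrative example (not executed on import): dense_mask(8, 16, 0.5, blocking=8)
+# splits the 8x16 matrix into two 8x8 blocks and zeroes out all but
+# round(2 * (1 - 0.5)) = 1 of them, so the returned float32 mask contains only
+# zeros and ones with round(block_rows * block_cols * (1 - sparsity)) * blocking**2
+# nonzeros.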
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/stk/random/random_ops_test.py b/build/torch210-cxx11-cu128-aarch64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from . import random_ops as random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/xpu_fused_moe.py b/build/torch210-cxx11-cu128-aarch64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# default
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+        for x in arr:
+ prefix.append(prefix[-1] + x)
+ return prefix
+
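+    # Illustrative example: exclusive_prefix_sum([2, 3, 1]) == [0, 2, 5, 6], i.e.
+    # the offset of each expert's first token followed by the total token count,
+    # which is the layout the grouped GEMM expects for expert_first_token_offset.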
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
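+
+# Illustrative example: with num_tokens=1000 and num_experts_per_node=8, the
+# first candidate satisfying ceilDiv(1000, b) * 8 <= b is b = 128
+# (ceilDiv(1000, 128) = 8 and 8 * 8 = 64 <= 128), so 128 is returned.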
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
+
+
+def implement_zp(qweight):
+    # Convert packed u4 weights to s4 so the GEMM kernel does not have to
+    # handle a zero point. Only the default zero point (8) is supported for now.
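+    # Illustrative mapping (assuming the default zero point of 8): each unpacked
+    # u4 nibble v in [0, 15] becomes the signed value v - 8 in [-8, 7];
+    # pack_compact below re-encodes each signed value as its 4-bit
+    # two's-complement nibble (sign bit in bit 3, low three bits of the value)
+    # and packs two nibbles back into one byte.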
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ func. Temporarily exposed here before GEMM fusion.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
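+    # The regions named above are packed back-to-back (each rounded up to a
+    # 256-byte boundary) into a single flat uint8 buffer; ws_map records
+    # (size, byte offset) for each region so typed views can be sliced out
+    # after the prologue kernel has filled them in.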
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
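+
+# Illustrative usage of the convenience wrappers (shapes, dtypes and the CUDA
+# device are assumptions; the calls are not executed on import):
+#
+#   x = torch.randint(0, 8, (1024,), dtype=torch.int32, device="cuda")
+#   counts = histogram(x, num_bins=8)                # tokens per bin
+#   offsets = cumsum(counts, dim=0, exclusive=True)  # start offset of each bin
+#   sorted_x, sorted_idx = argsort(x, end_bit=3)     # radix sort on 3 bits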
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/activation_fn.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/all_to_all.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
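+# Autograd wrapper around torch.distributed.all_to_all_single. The backward pass
+# is itself an all-to-all with the output/input split sizes swapped, which routes
+# each gradient shard back to the rank that produced the corresponding input
+# shard.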
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/arguments.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear # class of the fully connected layer in the shared expert (allows using a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
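+# Illustrative construction (values are examples only): a dropless MoE with
+# 8 experts, top-2 routing and the grouped (non-sparse) MLP backend:
+#
+#   args = Arguments(
+#       hidden_size=1024,
+#       ffn_hidden_size=4096,
+#       moe_num_experts=8,
+#       moe_top_k=2,
+#       mlp_impl='grouped',
+#       fp16=False,
+#       bf16=True,
+#   )
+
+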
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/common.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/dmlp_registry.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/dmoe.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
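+        #
+        # Illustrative example (assuming 2 blocks per row of a fully dense
+        # topology): blocks at offsets [0, 1, 2, 3] have (row, column) pairs
+        # (0,0), (0,1), (1,0), (1,1); sorting by column index gives the gather
+        # order [0, 2, 1, 3], and gathering the row indices in that order
+        # yields the transposed matrix's column indices [0, 1, 0, 1].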
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+        # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+        # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/gelu.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
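+# Backward of the tanh-approximated GELU used by F.gelu(..., approximate='tanh').
+# The constants are 0.79788456 ~= sqrt(2/pi) and 0.1070322243 = 3 * 0.044715 *
+# sqrt(2/pi), i.e. the derivative of 0.5 * x * (1 + tanh(sqrt(2/pi) * (x +
+# 0.044715 * x**3))) applied in place to the incoming gradient.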
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
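+
+# The constants above come from the tanh approximation of GeLU,
+#   gelu(x) ~= 0.5 * x * (1 + tanh(0.79788456 * (x + 0.044715 * x**3))),
+# where 0.79788456 ~= sqrt(2/pi) and 0.1070322243 ~= 3 * 0.044715 * sqrt(2/pi),
+# so `ff` is the analytic derivative of that expression. A quick,
+# illustrative check against autograd (not part of the library):
+#
+#   x = torch.randn(8, requires_grad=True)
+#   torch.nn.functional.gelu(x, approximate='tanh').sum().backward()
+#   assert torch.allclose(x.grad, _gelu_backward_inplace(torch.ones(8), x.detach()))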
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/glu.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
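+
+ # For reference, the dense equivalent of the block-sparse computation
+ # above is (illustrative only, ignoring routing and topology):
+ #
+ # out = (activation_fn(x @ w1.t()) * (x @ v1.t())) @ w2
+ #
+ # i.e. a gated linear unit where the w1 branch is activated and
+ # multiplied elementwise with the v1 branch before the w2 projection.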
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
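+
+# Usage sketch (shapes are assumptions): for `ne` experts, hidden size `hs`
+# and per-expert FFN width `fs`,
+#
+#   out = memory_optimized_grouped_glu(
+#       x,             # [num_tokens, hs]
+#       w1,            # [ne, fs, hs]
+#       v1,            # [ne, fs, hs]
+#       w2,            # [ne, fs, hs]
+#       batch_sizes,   # [ne], int64 on CPU: tokens per expert
+#       activation_fn,
+#   )
+#
+# The forward saves the layer input and the two first-layer outputs and
+# recomputes the activation in backward, trading compute for memory.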
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/memory_test.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6))
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
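+ # NOTE: the factor of 2 assumes two bytes per element (bf16 weights).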
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6))
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/mlp.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+ # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+ # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # Enable using a weighted sum for the shared expert output,
+ # weighted by the number of experts used.
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
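+
+ # Worked example (hypothetical numbers): with moe_top_k = 3 the shared
+ # expert counts as one extra expert, so t_experts = 4 and the result is
+ # shared_expert_out / 4 + expert_out * (3 / 4), i.e. the shared expert
+ # contributes with weight 1/4 and the combined routed-expert output
+ # with weight 3/4. Without the flag the two outputs are simply summed.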
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/moe.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} tokens_per_expert tensors '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+ f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
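+
+# Worked example of the scale (hypothetical values): with
+# moe_num_experts = 8, moe_loss_weight = 0.01, num_layers = 12,
+# tokens = 4096 and moe_top_k = 2,
+#   scale = (8 * 0.01) / (12 * 4096 * 2) ~= 8.14e-7,
+# which multiplies the dot product of per-expert token counts and mean
+# expert scores.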
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
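+
+ # Worked example (hypothetical values): with moe_top_k = 2, 4096 tokens,
+ # an expert-parallel world size of 8, 64 experts and
+ # moe_capacity_factor = 1.25, tokens_per_expert = 2 * 4096 * 8 / 64 = 1024
+ # and the capacity is int(1.25 * 1024) = 1280 tokens per expert.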
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
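+
+ # Illustrative example (assumed inputs): with num_experts = 4 and
+ # top_expert = [2, 0, 2, 1], the sort yields bin_ids = [0, 1, 2, 2] and
+ # indices = [1, 3, 0, 2], the histogram gives tokens_per_expert =
+ # [1, 1, 2, 0], and the inclusive cumsum gives bins = [1, 2, 4, 4],
+ # i.e. the end offset of each expert's contiguous token range.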
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+ #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/mpu.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
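+
+# Worked example (hypothetical configuration): with an expert-parallel
+# world size of 8 and moe_num_experts = 4, expert_sharding_degree = 4,
+# hidden_sharding_degree = 8 // 4 = 2, experts_per_rank = 4 // 4 = 1 and
+# features_per_rank = ffn_hidden_size // 2.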
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/router.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+ # so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
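+
+# Shape sketch (assumed sizes): for an input of shape [sl, bs, hs] with
+# moe_num_experts = E and moe_top_k = k, forward returns
+#   scores:         [sl * bs, E]  softmax over experts,
+#   expert_weights: [sl * bs, k]  top-k routing probabilities,
+#   expert_indices: [sl * bs, k]  selected expert ids per token.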
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch210-cxx11-cu128-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch210-cxx11-cu128-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..b730d142f5ea59b45f3e5a9f0e347dbd8b7b589f
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4e977b806a1a10968921e0bae84919664195cdc7baf05c08bf9ee63e4daa752
+size 21009984
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_ops.py b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_version.py b/build/torch210-cxx11-cu128-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/backend/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/backend/kernels.py b/build/torch210-cxx11-cu128-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton autotune when testing in an environment that does not have
+# CUDA. This approach preserves the original code but enables testing without
+# a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
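+# Illustrative note (a sketch, not part of the kernel API): with 2 experts,
+# tokens_per_expert = [3, 2] and block-size padding to multiples of 4, the
+# expected inputs would be bins = [3, 5] (inclusive cumsum of the token counts)
+# and padded_bins = [4, 8] (inclusive cumsum of the padded counts), so the
+# gathered output has padded_bins[-1] = 8 rows, with zero rows as padding.
+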
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per entry of 'wgrad' (one token/expert pair). Array 'x'
+    # has a greater or equal number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+    # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+    # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/benchmark_util.py b/build/torch210-cxx11-cu128-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
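+
+
+# Illustrative usage (a sketch, not executed on import), assuming a CUDA device:
+#
+#   a = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
+#   mean_ms, std_ms = benchmark_function(lambda: a @ a)
+#   log_benchmark("matmul", {"shape": tuple(a.shape)}, mean_ms, std_ms)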
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/cpu_fused_moe.py b/build/torch210-cxx11-cu128-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
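+# Illustrative check (a sketch, not executed on import) of the formula above:
+#
+#   gate, up = torch.randn(4, 8), torch.randn(4, 8)
+#   out = swigluoai_activation(gate, up)
+#   g, u = gate.clamp(max=7.0), up.clamp(-7.0, 7.0)
+#   assert torch.allclose(out, (u + 1) * (g * torch.sigmoid(g * 1.702)))
+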
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
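+# Illustrative usage (a sketch, assuming 4 experts, hidden size 8, top-2 routing):
+#
+#   x = torch.randn(3, 8)                      # 3 tokens
+#   router_weight = torch.randn(4, 8)          # [num_experts, hidden]
+#   logits, weights, idx = route_tokens_cpu(x, router_weight, None, 2, 4)
+#   # logits: [3, 4]; weights and idx: [3, 2]; each row of weights sums to 1.
+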
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+    This implementation loops over experts, but processes all tokens routed to
+    an expert with vectorized PyTorch operations, which is efficient enough on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # Process one expert at a time: for each expert, find the (token_idx,
+    # topk_pos) pairs routed to it and run that expert's MLP on those tokens.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
+
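+# Layout note (illustrative, assuming inter_size = 2): with the interleaved
+# GptOss layout a gate_up row is [g0, u0, g1, u1], so
+#
+#   row = torch.tensor([1.0, 10.0, 2.0, 20.0])
+#   gate, up = row[::2], row[1::2]        # [1., 2.] and [10., 20.]
+#
+# whereas the standard layout [g0, g1, u0, u1] is split at inter_size.
+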
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py b/build/torch210-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "CPUMegaBlocksMoeMLP"]
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/grouped_gemm/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/grouped_gemm/backend.py b/build/torch210-cxx11-cu128-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# TODO(tgale): Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+#
+# The backend operations are imported from the megablocks package itself,
+# since grouped_gemm is vendored into megablocks in this repository.
+from .._ops import ops as backend  # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/grouped_gemm/ops.py b/build/torch210-cxx11-cu128-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
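+
+
+# Illustrative usage (a sketch, assuming trans_b=False and a CUDA device):
+# three expert groups with 2, 3 and 1 rows share a single grouped GEMM.
+#
+#   a = torch.randn(6, 16, device="cuda", dtype=torch.bfloat16)      # sum(batch_sizes) x k
+#   b = torch.randn(3, 16, 32, device="cuda", dtype=torch.bfloat16)  # num_groups x k x n
+#   batch_sizes = torch.tensor([2, 3, 1])                            # per-group row counts
+#   c = gmm(a, b, batch_sizes)                                       # 6 x 32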
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/grouped_gemm_util.py b/build/torch210-cxx11-cu128-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored into this package, so no external import is
+    # required here; simply mark it as available.
+    _grouped_gemm_is_available = True
+except ImportError as error:
+    warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+    msg = (
+        'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+    )
+ assert _grouped_gemm_is_available, msg
+
+
+# Expose the vendored grouped_gemm modules under the names used elsewhere in
+# megablocks: `backend` holds the raw kernels and `ops` the autograd wrappers.
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/layers.py b/build/torch210-cxx11-cu128-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Calculate the expert sharding degree from the world size and number of experts
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
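+# Worked example (illustrative): with world_size = 8, moe_num_experts = 4 and
+# ffn_hidden_size = 1536, expert_sharding_degree = min(8, 4) = 4, so
+# hidden_sharding_degree = 8 // 4 = 2, experts_per_rank = 4 // 4 = 1 and
+# features_per_rank = 1536 // 2 = 768.
+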
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
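+# Worked example (illustrative): with moe_top_k = 3 and
+# shared_expert_weighted_sum = True, the shared expert contributes 1/4 of the
+# combined output and the routed experts 3/4; otherwise the two outputs are
+# simply added.
+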
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+    expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
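+# Worked example (illustrative): 1024 tokens, top_k = 4, 128 experts, a single
+# expert-parallel rank and moe_capacity_factor = 1.0 give
+# tokens_per_expert = 4 * 1024 * 1 / 128 = 32, i.e. 32 slots per expert.
+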
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, score_num_experts = expert_scores.size()
+    assert score_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
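+# Worked example (illustrative): with 2 experts, top_k = 1, 4 tokens routed as
+# tokens_per_expert = [3, 1] and mean expert_scores = [0.75, 0.25], the loss is
+# (2 / (4 * 1)) * (3 * 0.75 + 1 * 0.25) = 0.5 * 2.5 = 1.25, while a balanced
+# router ([2, 2] with scores [0.5, 0.5]) gives 0.5 * 2.0 = 1.0.
+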
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
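+# Illustrative example (assuming 3 experts): for top_expert = [2, 0, 2, 1],
+# ops.histogram yields tokens_per_expert = [1, 1, 2], ops.inclusive_cumsum
+# yields bins = [1, 2, 4], bin_ids = [0, 1, 2, 2] (expert ids in sorted order)
+# and indices holds the token positions sorted by expert, e.g. [1, 3, 0, 2].
+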
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+        # Kick off the token-count exchange asynchronously; we wait on the
+        # handle below before using the counts.
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
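+
+# Illustrative call (argument names and shapes here are assumptions, not a
+# documented API): for hidden size hs and ne experts with a fused gate/up
+# projection,
+#
+#   out, expert_weights, router_scores = moe_forward(
+#       x=hidden_states,                      # [sl, bs, hs]
+#       router_weight=router.weight,
+#       router_bias=router.bias,
+#       moe_top_k=4,
+#       moe_num_experts=ne,
+#       w1=gate_up_proj,                      # [ne, hs, 2 * ffn]
+#       w2=down_proj,                         # [ne, ffn, hs]
+#       w1_bias=gate_up_proj_bias,
+#       w2_bias=down_proj_bias,
+#       sort_end_bit=max(int(math.ceil(math.log2(ne))), 1),
+#       forward_fn=forward_once,
+#       hidden_size=hs,
+#   )
+#
+# returns the combined expert output in the input shape, the per-token expert
+# weights, and the dense router scores.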
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
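+
+# Minimal sketch of how the helper above might be used (init_method is any
+# in-place initializer; the sizes are made up):
+#
+#   up_w, down_w, up_b, down_b = create_shared_expert_weights(
+#       hidden_size=1152,
+#       shared_expert_hidden_size=3072,
+#       device=torch.device("cuda"),
+#       dtype=torch.bfloat16,
+#       init_method=torch.nn.init.kaiming_uniform_,
+#   )
+#
+# up_b and down_b come back as None; callers that want biases can allocate
+# their own.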
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
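+
+
+# Illustrative wiring of the shared-expert variant (the router/experts setup
+# mirrors MegaBlocksMoeMLP and is omitted; shapes follow
+# create_shared_expert_weights above):
+#
+#   mlp = MegaBlocksMoeMLPWithSharedExpert()
+#   ...  # attach mlp.router and mlp.experts as usual
+#   mlp.set_shared_expert_weights(
+#       up_proj_weight=shared_up,      # [shared_ffn, hs]
+#       down_proj_weight=shared_down,  # [hs, shared_ffn]
+#       weighted_sum=False,
+#       activation_fn=torch.nn.functional.silu,  # assumed activation
+#   )
+#   out, expert_weights = mlp(hidden_states)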
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/megablocks/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is: after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/metadata.json b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e3e4edf582b7ffb515d0ed32e9fc9c89f125c441
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per element (fp16).
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/binned_gather.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/binned_scatter.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/cumsum.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
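+
+
+# Example (illustrative, assuming an int32 tensor on the GPU):
+# inclusive_cumsum(torch.tensor([3, 1, 4], dtype=torch.int32, device="cuda"), 0)
+# gives tensor([3, 4, 8]), while exclusive_cumsum on the same input gives
+# tensor([0, 3, 4]). These running totals are the "bins" used to delimit each
+# expert's tokens.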
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/gather.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/histogram.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1), which calls
+# torch.as_strided(...). Circumvent this chain to avoid the overhead it adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/padded_gather.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/padded_scatter.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/repeat.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/replicate.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/round_up.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
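+
+
+# Worked example: round_up(torch.tensor([3, 128, 129], dtype=torch.int32), 128)
+# yields [128, 128, 256] -- each count is rounded up to the next multiple of
+# the block size.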
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/scatter.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/sort.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
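+# Returns both the sorted values and the permutation ("iota") that produced
+# them; `end_bit` limits how many low-order bits participate in the sort.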
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/stk_autocast.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/sum.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
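+# Sum over `dim`; when that dim has length one, squeezing gives the same
+# result without a reduction.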
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/ops/topology.py b/build/torch210-cxx11-cu128-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
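+# Allocates one int16 entry per output block (rows x columns) and fills it
+# with the block-sparse topology metadata computed by the `indices` kernel.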
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/backend/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/backend/autocast.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/backend/sputnik.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
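+# Gradient helpers: the gradient of out = lhs @ rhs w.r.t. either operand is
+# expressed as another matmul of dy with the opposite operand, with operand
+# transposes folded into the choice of op and argument order.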
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/backend/triton_kernels.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
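+# For each block row, write that row's index once per nonzero block, giving an
+# explicit per-block row id to accompany the CSR-style offsets.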
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/matrix.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+    block_rows = np.prod(shape[:-1]) // block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
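+    # Build the transposed matrix's metadata (column indices, offsets and a
+    # block permutation) without materializing the transposed data.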
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/linear_ops.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
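+# The three-letter names describe (output, lhs, rhs) with 'd' dense and
+# 's' block-sparse:
+#   dsd: dense  = sparse @ dense
+#   dds: dense  = dense  @ sparse
+#   sdd: sparse = dense  @ dense, with the sparsity pattern taken from `topo`.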
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
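+
+# Expands block-level (row, col) index pairs into the element-level indices
+# covered by each `blocking` x `blocking` block.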
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/random/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/random/random_ops.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
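+# Build a dense 0/1 mask where a uniformly random subset of the
+# `blocking` x `blocking` blocks is zeroed so that roughly `sparsity` of the
+# blocks are dropped.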
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py b/build/torch210-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from .. import random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/xpu_fused_moe.py b/build/torch210-cxx11-cu128-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM wrapper (unquantized weights, no scales).
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
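+    """Grouped GEMM over experts with unquantized weights.
+
+    expert_token_count[i] is the number of rows of input_A assigned to expert i;
+    its exclusive prefix sum becomes the per-expert row offsets handed to the kernel.
+    """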
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
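+    """Grouped GEMM wrapper that also supports int4 / mxfp4 quantized weights.
+
+    num_rows_per_expert is a device tensor; its cumulative sum (prefixed with 0)
+    is passed to the kernel as expert_first_token_offset.
+    """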
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
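+    """Return the smallest candidate block size for which the total number of
+    (sequence block, expert) pairs fits within that block size; defaults to 1024."""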
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
+
+
+def implement_zp(qweight):
+ # change u4 to s4 to avoid zero point in gemm kernel
+ # only support default zero point now
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
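+    # Pack two signed 4-bit values into one byte: bit 3 of each nibble holds the
+    # sign flag and bits 0-2 hold the low three bits of the two's-complement byte.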
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ op. Temporarily exposed here before GEMM fusion.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
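+        """Reserve a 256-byte-aligned slice of the uint8 workspace and record its (size, offset)."""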
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/activation_fn.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/all_to_all.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
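+    """Autograd-aware wrapper around dist.all_to_all_single.
+
+    The backward pass runs the reverse all-to-all with the output/input split sizes swapped.
+    """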
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/arguments.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in shared expert (purpose: to allow using custom FC layer eg te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by the number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+                import triton
+                from packaging import version
+
+                # String comparison misorders versions like '3.10.0' vs '3.2.0'.
+                if version.parse(triton.__version__) >= version.parse('3.2.0'):
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/common.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
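+    """Cast `tensor` to the active autocast dtype for its device; returns it unchanged when autocast is off."""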
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/dmlp_registry.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/dmoe.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
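+    """Promote a 0-d tensor to 1-d so it can be concatenated and indexed downstream."""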
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/gelu.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/glu.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+        # Activation and GLU gating.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+    """GLU for the shared expert.
+
+    Note: this is a copy -> paste -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/memory_test.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
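+    """Collect the contiguous tensors currently tracked by the garbage collector, de-duplicated by data pointer."""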
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6))
+
+ # Calculate weight and gradient memory usage.
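+    #
+    # NOTE: The factor of 2 below is the number of bytes per element for the
+    # bf16 parameters (and their gradients) created above.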
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6))
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/mlp.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+    def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+    # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+    # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+            # Enable using a weighted sum for the shared expert output,
+            # weighted by the number of experts used.
+ t_experts = self.args.moe_top_k + 1
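+            # e.g. with moe_top_k = 3 the shared expert contributes 1/4 of the
+            # combined output and the routed experts contribute 3/4.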
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/moe.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
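+        # Illustrative example: top_k=2, tokens=4096, world_size=8 and
+        # num_experts=64 gives 1024 token slots per expert before the
+        # capacity factor is applied.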
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
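+        # The usual auxiliary load-balancing loss: the dot product of the
+        # per-expert token counts with the mean router probabilities, scaled
+        # so that a perfectly uniform assignment yields 1.0.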
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+    # expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/mpu.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
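+    # Illustrative example: 4 experts on an 8-way expert parallel group gives
+    # esd = 4, so each expert's ffn_hidden_size is split hsd = 2 ways.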
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/router.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operation.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
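+        # NOTE: No backward is defined; the returned expert indices are
+        # integral and non-differentiable, so only the forward routing changes.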
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
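+        # Multiplicative jitter: draw a uniform sample from [1 - eps, 1 + eps]
+        # for every element and scale the input by it during training.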
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layers/sharedexpert_registry.py b/build/torch210-cxx11-cu130-aarch64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch210-cxx11-cu130-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..3aace39adc87c5eb54f3e3b57f2d05bf12eb3eb6
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a97da44105e24c5ce37c013d124fa87ddb71aa465a5278304f87f426bafd575
+size 12073200
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_ops.py b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_version.py b/build/torch210-cxx11-cu130-aarch64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/backend/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/backend/kernels.py b/build/torch210-cxx11-cu130-aarch64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have
+# CUDA. This approach preserves the original code but enables testing without
+# a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
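+#
+# 'bins' holds the inclusive cumulative sum of tokens routed to each expert;
+# 'padded_bins' holds the same cumulative sum after each expert's count has
+# been padded (e.g. rounded up to a block-size multiple), so 'b' can have
+# more rows than 'a'.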
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has greater or equal
+ # number of rows since they could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
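+        # No padding here, so 'bins' doubles as the 'padded_bins' argument.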
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
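+#
+# Computes the gradient w.r.t. the per-token expert weights as the dot product
+# of each routed activation row with the corresponding incoming gradient row.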
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has greater or equal
+ # number of rows since they could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
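+
+# Round-trip sketch for binned_gather / binned_scatter (illustrative sizes; a
+# CUDA device is assumed since these launch Triton kernels). Suppose 4 tokens,
+# 2 experts, top_k=1, with tokens 0,2 routed to expert 0 and tokens 1,3 to
+# expert 1. With rows sorted by expert:
+#
+#   indices = torch.tensor([0, 2, 1, 3], dtype=torch.int32, device="cuda")
+#   bins = torch.tensor([2, 4], dtype=torch.int32, device="cuda")
+#   x = torch.randn(4, 8, device="cuda")
+#   gathered = binned_gather(x, indices, None, bins, expert_capacity=2, top_k=1)
+#   # gathered: (2, 2, 8), one row per (expert, capacity slot)
+#   restored = binned_scatter(gathered, indices, None, bins, top_k=1)
+#   # restored: (4, 8), each row back at its original token position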
+
+
+# x: (num_experts, expert_capacity, num_columns), real.
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens * top_k), real.
+# indices: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/benchmark_util.py b/build/torch210-cxx11-cu130-aarch64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+    print(f'mean time = {time:.3f}ms, std time = {std:.3f}ms')
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+    for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
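+
+
+# Minimal usage sketch (not part of the utilities above): it assumes a CUDA
+# device and times a single matmul with the helpers in this file. The "MatMul"
+# label and the shapes are illustrative only.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        a = torch.randn(4096, 4096, device="cuda")
+        b = torch.randn(4096, 4096, device="cuda")
+        mean_ms, std_ms = benchmark_function(lambda: a @ b)
+        log_benchmark("MatMul", {"m": 4096, "n": 4096, "k": 4096}, mean_ms, std_ms)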
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/cpu_fused_moe.py b/build/torch210-cxx11-cu130-aarch64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
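+
+# Quick numeric sanity check of the formula above (values are arbitrary, not
+# taken from any model): zero inputs produce zero, and large positive values
+# are clamped to `limit` before the sigmoid gate is applied.
+#
+#   swigluoai_activation(torch.tensor([0.0, 100.0]), torch.tensor([0.0, 100.0]))
+#   # ≈ tensor([0., 56.])  since gate and up clamp to 7, giving (7 + 1) * 7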
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+    This implementation loops over the experts, but processes all tokens routed
+    to a given expert with a single batched matmul, which keeps the Python
+    overhead low and is reasonably efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # For each expert, find the (token_idx, topk_pos) pairs routed to it and
+    # process those tokens together with batched matmuls.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
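+
+# End-to-end sketch of the two helpers above (all sizes and the random weights
+# are illustrative, not a real checkpoint):
+#
+#   tokens, hidden, inter, experts, topk = 4, 64, 128, 8, 2
+#   x = torch.randn(tokens, hidden)
+#   router_w = torch.randn(experts, hidden)
+#   w1 = torch.randn(experts, hidden, 2 * inter)
+#   w2 = torch.randn(experts, inter, hidden)
+#   _, weights, ids = route_tokens_cpu(x, router_w, None, topk, experts)
+#   y = cpu_fused_moe(x, w1, w2, weights, ids, is_interleaved=True)
+#   assert y.shape == (tokens, hidden)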
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/cpu_moe_cpp.py b/build/torch210-cxx11-cu130-aarch64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP"]
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/grouped_gemm/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/grouped_gemm/backend.py b/build/torch210-cxx11-cu130-aarch64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
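+
+# Shape sketch for gmm (illustrative sizes; the compiled backend op and, in
+# this build, a CUDA device are assumed):
+#
+#   a: (sum(batch_sizes), k)   tokens grouped by expert
+#   b: (num_experts, k, n)     one weight matrix per expert (trans_b=False)
+#   c: (sum(batch_sizes), n)   outputs in the same grouping as `a`
+#
+#   batch_sizes = torch.tensor([3, 1, 4])  # tokens per expert
+#   a = torch.randn(8, 16, device="cuda", dtype=torch.bfloat16)
+#   b = torch.randn(3, 16, 32, device="cuda", dtype=torch.bfloat16)
+#   c = gmm(a, b, batch_sizes)             # -> (8, 32)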
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/grouped_gemm/ops.py b/build/torch210-cxx11-cu130-aarch64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/grouped_gemm_util.py b/build/torch210-cxx11-cu130-aarch64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # import grouped_gemm
+ pass
+ _grouped_gemm_is_available = True
+except ImportError as error:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/layers.py b/build/torch210-cxx11-cu130-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+            # Meta implementation - output is (num_experts, bin_size, hidden)
+            if x.dim() >= 2:
+                hidden_size = x.size(-1)
+                return torch.empty(
+                    (bins.numel(), bin_size, hidden_size),
+                    dtype=x.dtype,
+                    device=x.device,
+                )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+            # Meta implementation - the scatter reduces to (tokens, hidden)
+            if x.dim() >= 3:
+                tokens = indices.numel() // top_k if top_k > 0 else x.size(1)
+                return torch.empty(
+                    (tokens, x.size(2)), dtype=x.dtype, device=x.device
+                )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
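+
+# Shape sketch for mlp_forward (illustrative; the interleaved gate/up layout
+# matches the [..., ::2] / [..., 1::2] split above):
+#
+#   x:      (num_experts, expert_capacity, hidden_size)
+#   w1:     (num_experts, hidden_size, 2 * ffn_size), w1_bias: (num_experts, 2 * ffn_size)
+#   w2:     (num_experts, ffn_size, hidden_size),     w2_bias: (num_experts, hidden_size)
+#   output: (num_experts, expert_capacity, hidden_size)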
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
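+
+# Worked example of the weighted-sum branch (numbers are illustrative): with
+# moe_top_k=3 the shared expert contributes 1/4 of the output and the routed
+# experts 3/4, i.e. out = 0.25 * shared_expert_out + 0.75 * expert_out.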
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
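+
+# Worked example (illustrative numbers): tokens=4096, top_k=4, num_experts=128,
+# a single rank and capacity_factor=1.0 give 4 * 4096 * 1 / 128 = 128 token
+# slots per expert.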
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, num_scored_experts = expert_scores.size()
+    assert num_scored_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (num_counted_experts,) = tokens_per_expert.size()
+    assert num_counted_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
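+
+# Sanity check (illustrative): with perfectly uniform routing every expert sees
+# tokens * top_k / num_experts tokens and expert_scores.mean(dim=0) is
+# 1 / num_experts, so the dot product is tokens * top_k / num_experts and the
+# scaled loss is exactly 1.0.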
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+        # Kick off the asynchronous all-to-all for the token counts.
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
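+    """Run ``moe_forward`` and optionally add a shared (always-active) expert.
+
+    When ``shared_up_proj_weight`` and ``shared_down_proj_weight`` are provided,
+    the shared expert MLP is applied to the raw input and merged with the routed
+    expert output via ``combine_expert_shared_outputs``; otherwise this is
+    equivalent to ``moe_forward``. Returns ``(output, expert_weights, router_scores)``.
+    """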
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
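+    """Allocate and initialize the up/down projection weights for a shared expert.
+
+    The up projection has shape ``(shared_expert_hidden_size, hidden_size)`` and
+    the down projection ``(hidden_size, shared_expert_hidden_size)``. Biases are
+    not created, so the last two elements of the returned tuple are always ``None``.
+    """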
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
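+#
+# A minimal, plain-Python illustration of the closure-cell access used below
+# (the names here are illustrative only):
+#
+#   def make_hook(device_mesh):
+#       def hook(module, args):   # closes over `device_mesh`
+#           return args
+#       return hook
+#
+#   h = make_hook(mesh)
+#   h.__closure__[h.__code__.co_freevars.index("device_mesh")].cell_contents  # -> mesh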
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
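+    """Mixture-of-experts MLP block driven by attributes attached by the caller.
+
+    Expects ``self.router`` (a linear layer with ``weight``/``bias`` and optionally
+    ``top_k``) and ``self.experts`` providing ``gate_up_proj``, ``gate_up_proj_bias``,
+    ``down_proj``, ``down_proj_bias`` and ``hidden_size`` (plus optional tuning
+    attributes such as ``num_experts``, ``alpha`` or ``capacity_factor``).
+    ``forward`` returns ``(output, expert_weights)``.
+    """
+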
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
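+    """MoE MLP with an optional shared expert applied to every token.
+
+    Configure the shared expert via ``set_shared_expert_weights``; when no shared
+    weights are set, the layer behaves exactly like ``MegaBlocksMoeMLP``.
+    """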
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/megablocks/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is: once it is added to `sys.modules`,
+    # it would also be picked up by other imports. Instead, derive a unique
+    # module name from the hex-encoded hash of the file path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
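+# Load the `__init__.py` from the parent build directory under a unique module
+# name and re-export its symbols here, so importing this `megablocks` package
+# exposes the same public API as the build variant's top-level module.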
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/metadata.json b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a9813b81c6c98110d265c184f2016d728202289b
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/all_to_all_benchmark.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2B elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/binned_gather.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/binned_scatter.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/cumsum.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/gather.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/histogram.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/histogram_benchmark.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/matmul_benchmark.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/padded_gather.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/padded_scatter.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/padded_scatter_benchmark.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/permute_benchmark.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/repeat.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/replicate.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/round_up.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
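+    # Round each entry of `x` up to the nearest multiple of `value`, e.g.
+    # round_up(torch.tensor([5, 130], dtype=torch.int32), 128) -> tensor([128, 256]).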
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/scatter.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/sort.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
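+        # The second output receives the permutation (argsort indices) that sorts `x`.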
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/sort_benchmark.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/stk_autocast.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/sum.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
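+    # Summing over a singleton dimension is equivalent to dropping it, so skip
+    # the reduction and just squeeze.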
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/ops/topology.py b/build/torch210-cxx11-cu130-aarch64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/backend/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/backend/autocast.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+    """Wrap a custom autograd forward so that, under autocast, eligible tensor
+    arguments are cast to the autocast dtype and autocast is disabled inside."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
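+
+
+# Example sketch (illustrative, untested): the decorators mirror
+# torch.cuda.amp.custom_fwd/custom_bwd, casting eligible CUDA float tensors to
+# the active autocast dtype in the forward and disabling autocast in the
+# backward.
+#
+#   class Scale(torch.autograd.Function):
+#
+#       @staticmethod
+#       @custom_fwd
+#       def forward(ctx, x):
+#           return 2 * x
+#
+#       @staticmethod
+#       @custom_bwd
+#       def backward(ctx, grad):
+#           return 2 * grad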
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/backend/sputnik.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
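+
+
+# Example sketch (illustrative, untested): `dsd`, `dds`, and `sdd` take the
+# unpacked fields of an stk Matrix plus the dense operand(s); the wrappers in
+# stk.ops.linear_ops are the intended entry points, e.g.
+#
+#   y = dsd(m.size(), m.data, m.offsets, m.row_indices, m.column_indices,
+#           m.offsets_t, m.column_indices_t, m.block_offsets_t,
+#           not m.is_contiguous(), x)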
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/backend/triton_kernels.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+    error_string = "incompatible dimensions: dimension has length {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to sparse matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/matrix.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# TODO:
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers.
+# 3. Make indentation consistent.
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D data.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+    block_rows = np.prod(shape[:-1]) // block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
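+
+# Worked example (illustrative): for a 2x2 block grid with nonzero blocks at
+# (row, col) = (0, 1) and (1, 0), row_indices = [0, 1] and
+# column_indices = [1, 0]; argsort gives gather_indices = [1, 0], so
+# column_indices_t = [1, 0], block_offsets_t = [1, 0], and
+# offsets_t = [0, 1, 2] (one block per transposed row).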
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+                    f"Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
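+
+
+# Example sketch (illustrative, untested): a compatible operand can be built
+# directly from a's topology.
+#
+#   b = Matrix(a.size(), torch.ones_like(a.data), a.row_indices,
+#              a.column_indices, a.offsets)
+#   c = mul(a, b)  # c.data equals a.data; same topology as a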
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops_test.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/linear_ops.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
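+
+
+# Example sketch (illustrative, untested): dsd/dds mix one sparse and one
+# dense operand and return a dense tensor; sdd multiplies two dense operands
+# and keeps only the blocks selected by `topo`.
+#
+#   y = dsd(a_sparse, b_dense)              # dense output
+#   y = dds(a_dense, b_sparse)              # dense output
+#   y = sdd(a_dense, b_dense, topo_sparse)  # Matrix with topo's layout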
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/linear_ops_test.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
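+
+
+# Example round trip (illustrative, untested): a blocked dense matrix that is
+# zero outside its nonzero blocks survives to_sparse/to_dense unchanged.
+#
+#   mask = torch.ones(256, 256); mask[:128, 128:] = 0
+#   x = (torch.randn(256, 256) * mask).half().cuda()
+#   m = to_sparse(x, blocking=128)
+#   assert torch.equal(to_dense(m), x)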
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops_test.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/random/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/random/random_ops.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
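+
+
+# Example sketch (illustrative, untested):
+#
+#   m = dense_mask(256, 256, sparsity=0.5, blocking=128)  # dense 0/1 float mask
+#   s = mask(256, 256, 0.5, blocking=128)                 # block-sparse Matrix of ones
+#   r = randn((256, 256), 0.5, blocking=128)              # block-sparse Matrix of randoms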
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/stk/random/random_ops_test.py b/build/torch210-cxx11-cu130-aarch64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from . import random_ops as random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/xpu_fused_moe.py b/build/torch210-cxx11-cu130-aarch64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM path (no quantization scales).
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
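+
+
+# Worked example (illustrative): with num_tokens=1000 and 8 experts per node,
+# 32 tokens/block gives 32 blocks (32 * 8 = 256 > 32), 64 gives 16 blocks
+# (128 > 64), and 128 gives 8 blocks (64 <= 128), so 128 is returned.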
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
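+
+
+# Example sketch (illustrative, untested):
+#
+#   buf = torch.zeros(64, dtype=torch.uint8)
+#   offsets = _bytes_to_typed_tensor(buf, torch.int64)  # 8 zeros of dtype int64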
+
+
+def implement_zp(qweight):
+    # Convert u4 to s4 so the GEMM kernel does not need to handle a zero point.
+    # Only the default zero point (8) is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
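+
+
+# Worked example (illustrative): each input byte packs two u4 nibbles;
+# subtracting 8 recentres them to s4, and pack_compact re-encodes each s4
+# value as a two's-complement nibble (sign bit plus the low 3 bits), so the
+# byte 0x00 (u4 values 0, 0) becomes s4 values -8, -8 and is repacked as 0x88.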
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ function. Temporarily exposed before GEMM fusion.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
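+    # Illustrative note (not part of the original source): each region is padded to a
+    # multiple of 256 bytes, e.g. config_ws("a", 100) reserves 256 bytes at offset 0,
+    # so the next region starts at offset 256.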
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
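+
+# Illustrative call sketch (assumptions: an XPU device and the compiled ops are
+# available; shapes follow the docstring above):
+#   hidden_states: [num_rows, hidden_size], w13: [E, 2*inter_size, hidden_size],
+#   w2: [E, hidden_size, inter_size], topk_weights/topk_ids: [num_rows, top_k]
+#   out = xpu_fused_moe(hidden_states, w13, None, None, w2, None, None,
+#                       topk_weights, topk_ids, n_experts_per_token=top_k,
+#                       activation="silu", num_experts=E)
+#   out.shape == hidden_states.shape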
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
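+
+# Shape sketch (illustrative): for x of shape [tokens, hidden] and router_weight of
+# shape [num_experts, hidden], logits is [tokens, num_experts] while expert_weights
+# and expert_indices are both [tokens, moe_top_k].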
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
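+
+# Example (illustrative, assuming the kernel follows standard exclusive-cumsum
+# semantics):
+#   x = torch.tensor([1, 2, 3]); out = torch.empty_like(x)
+#   exclusive_cumsum(x, 0, out)  # out -> tensor([0, 1, 3])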
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
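+
+# Example (illustrative, assuming bins correspond to the integer values
+# 0..num_bins-1, as when histogramming expert ids):
+#   histogram(torch.tensor([0, 1, 1, 3]), num_bins=4)  # -> counts [1, 2, 0, 1]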
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
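+
+# Example (illustrative, assuming standard radix-sort semantics): sorting expert
+# ids [3, 1, 2] with end_bit=2 gives x_out = [1, 2, 3] and iota_out = [1, 2, 0]
+# (the original positions of the sorted values).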
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/activation_fn.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/all_to_all.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/arguments.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in the shared expert (purpose: to allow using a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
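+# Example (illustrative): a dropless-MoE configuration using the grouped backend.
+#   args = Arguments(hidden_size=1024, ffn_hidden_size=4096, moe_num_experts=8,
+#                    moe_top_k=2, mlp_type='glu', mlp_impl='grouped',
+#                    fp16=False, bf16=True)
+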
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/common.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/dmlp_registry.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e. only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
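+
+# Example (illustrative):
+#   args = Arguments(mlp_type='glu', mlp_impl='grouped')
+#   mlp = get(args)  # returns a GroupedGLU instance per the registry above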
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/dmoe.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/gelu.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
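+    # ff below is d/dx of the tanh-approximated GeLU 0.5*x*(1 + tanh(inner)) with
+    # inner = 0.79788456*x*(1 + 0.044715*x*x); note 0.1070322243 ≈ 3 * 0.044715 * 0.79788456.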
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/glu.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/memory_test.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/mlp.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
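+# Usage note (a sketch, not additional API): SparseMLP.forward below invokes
+# this as memory_optimized_mlp(x, w1, w2, topo, activation_fn), where `topo`
+# is the stk.Matrix block-sparse topology supplied by the caller. The backward
+# pass recomputes the activation output rather than saving it, trading extra
+# FLOPs for lower peak memory.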
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+        # this rank. If the master weights are created first, the PyTorch
+        # caching allocator appears to use the same memory block for these
+        # and the slice, which causes large increases in our peak memory
+        # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
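+# Usage note (a sketch, not additional API): GroupedMLP.forward below invokes
+# this as memory_optimized_grouped_mlp(x, w1, w2, batch_sizes, activation_fn),
+# where `batch_sizes` is a CPU int64 tensor of tokens per expert. As in the
+# sparse variant, the activation output is recomputed in the backward pass to
+# save memory.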
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+    Note: this is a copy -> paste -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+            # Enable using a weighted sum for the shared expert output,
+            # weighted by the number of experts used.
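+            # e.g. with moe_top_k=4 (an illustrative value) the shared expert
+            # output is weighted by 1/5 and the routed expert output by 4/5.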
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/moe.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f'Expected {num_layers_per_pipeline_stage} tokens_per_expert '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
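+    #
+    # Illustrative example (hypothetical values): with moe_loss_weight=0.01,
+    # moe_num_experts=8, num_layers=12, tokens=4096 and moe_top_k=2, the scale
+    # is 0.01 * 8 / (12 * 4096 * 2), roughly 8.1e-7.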
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped so that the weight all-gathers can be scheduled *before* the
+# expert model parallel all-to-all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
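+        # Illustrative example (hypothetical values): with top_k=2, tokens=1024,
+        # world_size=4, num_experts=64 and moe_capacity_factor=1.25, this
+        # returns int(1.25 * 2 * 1024 * 4 / 64) = 160 slots per expert.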
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
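+        #
+        # In short (an illustrative summary of the steps above, not extra
+        # behavior): local gather -> all_to_all -> local re-sort by expert ->
+        # expert MLP -> all_to_all back -> hidden-shard reduce -> local scatter.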
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+            # If we're sharding the experts along the hidden dimension,
+            # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+        # If we're sharding the experts along the hidden dimension,
+        # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+        # Un-permute locally to set up for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/mpu.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
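+
+
+# Illustrative sharding example (hypothetical configuration): with
+# moe_num_experts=8, ffn_hidden_size=4096 and an expert-parallel world size of
+# 16, expert_sharding_degree(args) == 8, hidden_sharding_degree(args) == 2,
+# experts_per_rank(args) == 1 and features_per_rank(args) == 2048.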
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/router.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
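+        # e.g. with moe_jitter_eps=0.1 (an illustrative value) each activation
+        # is scaled by a factor drawn uniformly from [0.9, 1.1).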
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch210-cxx11-cu130-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch210-cxx11-cu130-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..ae40d277669f147909c87b24ab352118b0c55653
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3f4b9db1caad794b2dfa9befd5d7225e1a0a78bd891f82bb1d1d84c46143ddf
+size 12041592
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_ops.py b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_version.py b/build/torch210-cxx11-cu130-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/backend/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/backend/kernels.py b/build/torch210-cxx11-cu130-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have
+# CUDA. This preserves the original code while enabling testing without a GPU.
+if torch.cuda.is_available() is False:
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per (token, top_k) entry. The padded array 'x' has a
+    # greater or equal number of rows.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/benchmark_util.py b/build/torch210-cxx11-cu130-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
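+
+
+# Hedged usage sketch (the `model` and `batch` names below are illustrative):
+#
+#   def run():
+#       model(batch)
+#
+#   mean_ms, std_ms = benchmark_function(run, iterations=100, warmup=10)
+#   log_benchmark('moe_forward', {'batch_size': 8}, mean_ms, std_ms)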
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/cpu_fused_moe.py b/build/torch210-cxx11-cu130-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
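+
+
+# Minimal sketch of the two activations (shapes are illustrative):
+#
+#   gate = torch.randn(4, 16)
+#   up = torch.randn(4, 16)
+#   y_gptoss = swigluoai_activation(gate, up)   # clamped SwiGLU used by GptOss
+#   y_silu = silu_and_mul_activation(gate, up)  # standard SiLU(gate) * up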
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
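+
+
+# Hedged end-to-end sketch (hypothetical sizes) showing how routing feeds the
+# fused MoE below:
+#
+#   x = torch.randn(32, 1152)                   # [tokens, hidden]
+#   router_w = torch.randn(8, 1152)             # [num_experts, hidden]
+#   _, topk_w, topk_ids = route_tokens_cpu(x, router_w, None, 2, 8)
+#   w1 = torch.randn(8, 1152, 2 * 3072)         # gate_up_proj per expert
+#   w2 = torch.randn(8, 3072, 1152)             # down_proj per expert
+#   out = cpu_fused_moe(x, w1, w2, topk_w, topk_ids, activation="swigluoai")
+#   assert out.shape == (32, 1152)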
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+    This implementation loops over the experts and, for each one, processes every
+    token routed to it with batched tensor operations, which keeps the per-expert
+    math vectorized on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # For each expert, find the (token_idx, topk_pos) pairs routed to it and
+    # process those tokens in a single batched pass.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
+
+
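+# End-to-end sketch (assumed toy sizes, never called): wire route_tokens_cpu and
+# cpu_fused_moe together with a standard (non-interleaved) gate/up layout. It only
+# documents how the two helpers compose.
+def _cpu_fused_moe_example() -> torch.Tensor:
+    tokens, hidden, inter, num_experts, top_k = 8, 16, 32, 4, 2
+    x = torch.randn(tokens, hidden)
+    router_weight = torch.randn(num_experts, hidden)
+    w1 = torch.randn(num_experts, hidden, 2 * inter) * 0.02  # gate_up_proj
+    w2 = torch.randn(num_experts, inter, hidden) * 0.02      # down_proj
+    _, weights, indices = route_tokens_cpu(x, router_weight, None, top_k, num_experts)
+    out = cpu_fused_moe(x, w1, w2, weights, indices, activation="silu", is_interleaved=False)
+    assert out.shape == x.shape
+    return out
+
+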
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py b/build/torch210-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP"]
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/grouped_gemm/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/grouped_gemm/backend.py b/build/torch210-cxx11-cu130-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/grouped_gemm/ops.py b/build/torch210-cxx11-cu130-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
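+
+
+# Usage sketch (shapes inferred from _allocate_output in backend.py; illustrative
+# only and never called): 'a' stacks every group's rows along dim 0 and 'b' holds
+# one [k, n] matrix per group, so the result has sum(batch_sizes) rows.
+def _gmm_example():
+    batch_sizes = torch.tensor([2, 3], dtype=torch.int64)          # two groups
+    a = torch.randn(5, 8, device="cuda", dtype=torch.bfloat16)     # sum(batch_sizes) x k
+    b = torch.randn(2, 8, 4, device="cuda", dtype=torch.bfloat16)  # num_groups x k x n
+    return gmm(a, b, batch_sizes)                                  # -> [5, 4]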
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/grouped_gemm_util.py b/build/torch210-cxx11-cu130-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored inside this package, so there is no external
+    # import to attempt here; mark it as available unconditionally.
+    _grouped_gemm_is_available = True
+except ImportError:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+    msg = (
+        'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+    )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/layers.py b/build/torch210-cxx11-cu130-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
+
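+# Worked example (hypothetical configuration, never called): with 8 ranks and 4
+# experts, the experts are sharded 4 ways and the hidden dimension 2 ways, so each
+# rank holds one expert with half of a 1024-wide FFN.
+def _sharding_example():
+    world_size, num_experts, ffn_hidden = 8, 4, 1024
+    assert expert_sharding_degree(world_size, num_experts) == 4
+    assert hidden_sharding_degree(world_size, num_experts, ffn_hidden) == 2
+    assert experts_per_rank(num_experts, world_size) == 1
+    assert features_per_rank(ffn_hidden, world_size, num_experts) == 512
+
+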
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
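+# Shape sketch for mlp_forward (assumed toy sizes, defined but never called): the
+# input is already grouped per expert as [num_experts, capacity, hidden], gate and
+# up are interleaved along the last dim of w1, and the output keeps the input shape.
+def _mlp_forward_shape_example() -> torch.Tensor:
+    ne, cap, hs, isz = 4, 8, 16, 32
+    x = torch.randn(ne, cap, hs)
+    w1 = torch.randn(ne, hs, 2 * isz) * 0.02
+    w1_bias = torch.zeros(ne, 2 * isz)
+    w2 = torch.randn(ne, isz, hs) * 0.02
+    w2_bias = torch.zeros(ne, hs)
+    out = mlp_forward(x, w1, w2, w1_bias, w2_bias)
+    assert out.shape == (ne, cap, hs)
+    return out
+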
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
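+# Small worked example (illustrative values, never called): with top_k=3 the
+# weighted-sum mode blends 1/4 of the shared output with 3/4 of the routed output,
+# so constant inputs of 1.0 and 2.0 combine to 1.75 everywhere.
+def _combine_outputs_example() -> torch.Tensor:
+    shared = torch.full((2, 4), 1.0)
+    routed = torch.full((2, 4), 2.0)
+    return combine_expert_shared_outputs(
+        shared, routed, shared_expert_weighted_sum=True, moe_top_k=3
+    )
+
+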
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup],
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
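+# Worked example (hypothetical sizes, never called): 1024 tokens with top_k=4 over
+# 128 experts average 32 slots per expert, so a capacity factor of 1.0 yields 32.
+def _expert_capacity_example() -> int:
+    return expert_capacity(
+        tokens=1024,
+        top_k=4,
+        num_experts=128,
+        expert_parallel_group=None,
+        moe_capacity_factor=1.0,
+        moe_expert_model_parallelism=False,
+    )  # int(1.0 * 4 * 1024 * 1 / 128) == 32
+
+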
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, score_num_experts = expert_scores.size()
+    assert score_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (count_num_experts,) = tokens_per_expert.size()
+    assert count_num_experts == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
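+# Toy worked example (illustrative values, never called): a perfectly uniform
+# router over 4 experts with top_k=1 gives scale = 4 / (8 * 1) = 0.5 and a dot
+# product of 2.0, so the auxiliary loss comes out to exactly 1.0.
+def _load_balancing_loss_example() -> torch.Tensor:
+    tokens, num_experts, top_k = 8, 4, 1
+    expert_scores = torch.full((tokens, num_experts), 1.0 / num_experts)
+    tokens_per_expert = torch.full((num_experts,), tokens / num_experts)
+    return load_balancing_loss(tokens_per_expert, expert_scores, top_k, num_experts)
+
+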
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
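+# Plain-PyTorch sketch of what indices_and_bins computes (for documentation only;
+# the CUDA path above is not replaced by this and the helper is never called):
+# sort the flat expert assignments, count tokens per expert, and take the
+# inclusive cumulative sum as bin boundaries.
+def _indices_and_bins_reference(top_expert: torch.Tensor, num_experts: int):
+    bin_ids, indices = torch.sort(top_expert.int())
+    tokens_per_expert = torch.bincount(top_expert.long(), minlength=num_experts)
+    bins = torch.cumsum(tokens_per_expert, dim=0)
+    return indices, bin_ids, bins, tokens_per_expert
+
+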
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Ensure CUB knows which device to use
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/megablocks/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be picked up by other imports. Instead, we derive a unique
+ # module name from the hex-encoded hash of the file path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
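+# Load the real package __init__.py from the build-variant root (one directory
+# up) under a unique module name and re-export its public names here, so this
+# per-variant `megablocks` package exposes the same API as the top-level one.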
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/metadata.json b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a9813b81c6c98110d265c184f2016d728202289b
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2 bytes per fp16 element.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
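+# NOTE: this entry point expects the usual torch.distributed environment
+# variables (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT), e.g. as set by
+# torchrun, since init_process_group() reads its configuration from them.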
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/binned_gather.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
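+# Usage note (a sketch, based on how the op is called elsewhere in this package):
+# binned_gather(x, indices, bins, bin_size, top_k) groups tokens by expert into
+# a dense (num_experts, bin_size, hidden) buffer, where bin_size is the fixed
+# per-expert capacity; binned_scatter is its inverse.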
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/binned_scatter.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
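+# Usage note (sketch): binned_scatter(x, indices, weights, bins, top_k) writes
+# each token's expert outputs back to their original positions, scaling by the
+# router `weights` when provided and reducing the top_k copies per token.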
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/cumsum.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap the import in a try-block so that a missing or unbuilt C++ extension
+# raises a clearer error message.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
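+# Usage sketch (mirrors the benchmark code in this directory):
+#   tokens_per_expert = histogram(top_expert, num_experts)   # int32 counts
+#   bins = inclusive_cumsum(tokens_per_expert, 0)            # end offset per expert
+#   starts = exclusive_cumsum(tokens_per_expert, 0)          # start offset per expert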
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/gather.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/histogram.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap the import in a try-block so that a missing or unbuilt C++ extension
+# raises a clearer error message.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
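+# Usage sketch: histogram(x, max_val) counts how many entries of `x` fall into
+# each of the buckets 0..max_val-1, e.g.
+#   tokens_per_expert = histogram(top_expert, num_experts)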
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/histogram_benchmark.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/matmul_benchmark.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/padded_gather.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
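+# Usage note (sketch): like gather, but each expert's token group is padded so
+# that group boundaries (`padded_bins`, typically produced via round_up to the
+# sparse block size) align with the block-sparse matmul blocking.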
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/padded_scatter.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/permute_benchmark.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/repeat.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/replicate.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap the import in a try-block so that a missing or unbuilt C++ extension
+# raises a clearer error message.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/round_up.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+ # TODO(tgale): If this becomes an issue
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
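+# Example: round_up(torch.tensor([5, 128, 130], dtype=torch.int32), 128)
+# returns tensor([128, 128, 256], dtype=torch.int32).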
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/scatter.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
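+# Usage note (sketch): scatter is the inverse of gather in the dMoE pipeline:
+# it un-permutes the per-expert outputs back to token order, scales each copy
+# by the router `weights`, and reduces the top_k copies for every token.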
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/sort.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap the import in a try-block so that a missing or unbuilt C++ extension
+# raises a clearer error message.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
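+# Usage sketch: sort(x, end_bit) radix-sorts the low `end_bit` bits of `x` and
+# also returns the permutation, e.g.
+#   bin_ids, indices = sort(top_expert, sort_end_bit)
+# where `indices` can then be used to gather tokens grouped by expert.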
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/sort_benchmark.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/stk_autocast.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+ # NOTE: the vendored source checks `isinstance(x, map)`, which never matches a dict.
+ elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/sum.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/ops/topology.py b/build/torch210-cxx11-cu130-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap the import in a try-block so that a missing or unbuilt C++ extension
+# raises a clearer error message.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
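+# Usage note (sketch): topology(padded_bins, block_size, block_rows, blocks_per_row)
+# returns the int16 column indices of the nonzero blocks in the block-sparse
+# token-by-expert-FFN topology, as consumed by the stk sparse matmul kernels.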
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/backend/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/backend/autocast.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+ # NOTE: the vendored source checks `isinstance(x, map)`, which never matches a dict.
+ elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/backend/sputnik.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/backend/triton_kernels.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
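+
+# Worked example (hedged sketch): for a BCSR matrix with three block rows and
+# offsets = [0, 2, 2, 5], block row 0 owns nonzero blocks 0-1, row 1 owns none,
+# and row 2 owns blocks 2-4, so the kernel fills out = [0, 0, 2, 2, 2].
+# shape, data and column_indices are unused by this Triton backend.
+#
+#   offsets = torch.tensor([0, 2, 2, 5], dtype=torch.int32, device="cuda")
+#   out = torch.empty(5, dtype=torch.int32, device="cuda")
+#   row_indices(None, None, offsets, None, out)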
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/matrix.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
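+
+# Worked example (hedged sketch): for a 2x2-block anti-diagonal matrix with
+# blocking 1, size (2, 2), row_indices = [0, 1] and column_indices = [1, 0],
+# sorting by column gives column_indices_t = [1, 0], block_offsets_t = [1, 0]
+# and offsets_t = [0, 1, 2] (one nonzero block per column of the transpose).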
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
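+
+# Usage sketch (hedged; assumes a CUDA device and that stk is importable):
+#
+#   import torch, stk
+#   a = stk.random.randn((256, 256), sparsity=0.5, blocking=128).cuda()
+#   b = stk.Matrix(a.size(), torch.ones_like(a.data), a.row_indices,
+#                  a.column_indices, a.offsets)
+#   c = stk.ops.mul(a, b)   # shares a's topology; entries are a.data * b.data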
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/linear_ops.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
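+
+# Usage sketch for the three products above (hedged; assumes CUDA, fp16 and
+# that stk is importable):
+#
+#   import torch, stk
+#   topo = stk.random.mask(512, 512, sparsity=0.75, blocking=128).cuda()
+#   x = torch.randn(512, 512, dtype=torch.float16, device="cuda")
+#   w = torch.randn(512, 512, dtype=torch.float16, device="cuda")
+#   y = stk.ops.sdd(x, w, topo)   # dense @ dense -> sparse with topo's layout
+#   z = stk.ops.dsd(y, w)         # sparse @ dense -> dense
+#   u = stk.ops.dds(x, y)         # dense @ sparse -> dense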
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
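+
+# Round-trip sketch (hedged): converting a block-sparse fp16 matrix to sparse
+# format and back should reproduce it exactly, e.g.
+#
+#   x = torch.zeros(256, 256, dtype=torch.float16)
+#   x[:128, :128] = torch.randn(128, 128).half()   # a single nonzero block
+#   assert torch.equal(to_dense(to_sparse(x, blocking=128)), x)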
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/random/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/random/random_ops.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
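+
+# Usage sketch (hedged): build a ~75%-sparse 512x512 fp16 matrix with 128x128
+# blocking; nnz counts the stored block values, not the logical matrix size.
+#
+#   x = randn((512, 512), sparsity=0.75, blocking=128)
+#   assert x.nnz == 4 * 128 * 128   # 4 of the 16 blocks are kept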
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/stk/random/random_ops_test.py b/build/torch210-cxx11-cu130-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from stk import random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/xpu_fused_moe.py b/build/torch210-cxx11-cu130-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped gemm wrapper (expert offsets computed from per-expert token counts).
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
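+
+# Worked example (hedged sketch): with 1024 tokens and 8 experts per node,
+# a block of 32 gives ceilDiv(1024, 32) * 8 = 256 > 32 and 64 gives 128 > 64,
+# while 128 is the first size with ceilDiv(1024, 128) * 8 = 64 <= 128, so the
+# function returns 128.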
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
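+
+# Example (hedged sketch): 16 bytes reinterpreted as int32 yield 4 elements;
+# the values match what `Tensor.view(torch.int32)` would produce, but via a copy.
+#
+#   buf = torch.arange(16, dtype=torch.uint8)
+#   vals = _bytes_to_typed_tensor(buf, torch.int32)   # shape (4,)
+#   assert torch.equal(vals, buf.view(torch.int32))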
+
+
+def implement_zp(qweight):
+    # Change u4 to s4 to avoid handling a zero point in the gemm kernel.
+    # Only the default zero point is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
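+
+# Worked example (hedged sketch; the sign/low-3-bit packing below is an
+# assumption about the int4 format expected by the grouped gemm kernel):
+# the byte 0x9F holds the u4 nibbles (9, 15); subtracting the default zero
+# point of 8 gives the s4 values (1, 7), which re-pack to 0x17.
+#
+#   packed = implement_zp(torch.tensor([[0x9F]], dtype=torch.uint8))
+#   assert packed.item() == 0x17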
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+    # 4-bit weights use an [E, N, K] layout;
+    # other dtypes use [E, K, N].
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ func. Temporarily exposed before gemm fusion.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
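+
+# Example (hedged sketch): for logits [[0.1, 2.0, -1.0, 0.5]] and moe_top_k=2,
+# torch.topk returns values [[2.0, 0.5]] and indices [[1, 3]]; with
+# moe_top_k=1 the max/keepdim path returns the same leading expert with k=1 shapes.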
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
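+
+
+# Illustrative routing sketch (assumed shapes, comment only): for 6 tokens with
+# hidden_size = 1152, 128 experts and top_k = 4,
+#
+#   x: [6, 1152], router_weight: [128, 1152], router_bias: [128]
+#   logits, weights, ids = route_tokens_xpu(
+#       x, router_weight, router_bias, moe_top_k=4, moe_num_experts=128)
+#   # logits: [6, 128]; weights, ids: [6, 4]. The weights are a softmax over
+#   # the selected logits and sum to 1 per token unless
+#   # moe_normalize_expert_weights re-normalizes them by the given p-norm.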
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
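+
+    Example (illustrative sketch; counts integer ids into num_bins buckets):
+
+        ids = torch.tensor([0, 2, 2, 1], device="xpu")
+        histogram(ids, 4)   # -> tensor([1, 1, 2, 0])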
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
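+
+    Example (illustrative sketch; assumes the compiled ops extension and a
+    supported device such as XPU are available):
+
+        x = torch.tensor([1, 2, 3], device="xpu")
+        cumsum(x)                   # inclusive -> tensor([1, 3, 6])
+        cumsum(x, exclusive=True)   # exclusive -> tensor([0, 1, 3])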
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
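+
+    Example (illustrative sketch; the index tensor's dtype follows the
+    underlying radix-sort kernel):
+
+        vals = torch.tensor([3, 1, 2], device="xpu")
+        sorted_vals, order = argsort(vals)
+        # sorted_vals -> tensor([1, 2, 3]); order holds each element's
+        # original position, e.g. tensor([1, 2, 0])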
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/activation_fn.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/all_to_all.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
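+
+
+# Illustrative usage sketch (comment only): assumes torch.distributed is
+# initialized and the split sizes are consistent across ranks.
+#
+#   out, handle = all_to_all(x, output_split_sizes, input_split_sizes, group,
+#                            async_op=True)
+#   handle.wait()  # wait for the exchange to finish before reading `out`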
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/arguments.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from packaging import version
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in shared expert (purpose: to allow using custom FC layer eg te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+                if version.parse(triton.__version__) >= version.parse('3.2.0'):
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/common.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/dmlp_registry.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
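+
+    Example (illustrative; assumes a suitable device and, for the grouped
+    backend, the grouped GEMM extension):
+
+        args = Arguments(mlp_type='glu', mlp_impl='grouped')
+        expert_mlp = get(args)   # -> GroupedGLU instance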
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/dmoe.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
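+        #
+        # For example (illustrative), with blocks_per_row = 4 a block at
+        # row_index = 2, column_index = 9 has offset 2 * 4 + 9 % 4 = 9;
+        # dividing by blocks_per_row recovers 9 // 4 = 2 as the transposed
+        # column index.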
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
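+        # Illustrative example: with blocking = 128, ffn_hidden_size = 512 and
+        # 256 padded tokens, blocks_per_row = 4, block_rows = 2 and
+        # offsets = [0, 4, 8] -- each block row owns exactly four nonzero blocks.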
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
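+        # Worked example (illustrative): with tokens_per_expert = [3, 5] and
+        # blocking = 128, padded_tokens_per_expert = [128, 128] and
+        # padded_bins = [128, 256], so every expert's tokens start on a
+        # block-aligned boundary of the permuted activation matrix.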
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/gelu.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/glu.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+    """GLU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/memory_test.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MiB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/mlp.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+    def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
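+
+    # Illustrative example: with num_experts = 8, expert_sharding_degree = 4
+    # and hidden_sharding_degree = 2, rank 5 maps to expert_rank = 1 (experts
+    # 2..3) and row_rank = 1 (the second half of each expert's ffn rows).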
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
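+        #
+        # Illustrative example: with moe_top_k = 2, t_experts = 3, so the
+        # combined output is shared_expert_out / 3 + expert_out * (2 / 3);
+        # the shared expert counts as one of three contributing experts.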
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+            # weighted by the number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
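
The `add_experts_sharedexpert` helper above folds the shared expert into the routed-expert mixture: when `shared_expert_weighted_sum` is set, the shared expert is treated as one extra expert next to the `moe_top_k` routed ones, so the two outputs are combined with weights `1/(top_k+1)` and `top_k/(top_k+1)`. A minimal sketch of that arithmetic in plain PyTorch (the `top_k` value and tensor shapes are illustrative assumptions, not taken from the diff):

```python
import torch

# Hypothetical shapes and top_k; only the combination rule matters here.
top_k = 2
shared_out = torch.randn(4, 8)   # [tokens, hidden] from the shared expert
expert_out = torch.randn(4, 8)   # [tokens, hidden] from the routed experts

t_experts = top_k + 1            # routed experts plus the shared expert
weighted = shared_out / t_experts + expert_out * (top_k / t_experts)

# Without shared_expert_weighted_sum the two outputs are simply added.
plain = shared_out + expert_out
print(weighted.shape, plain.shape)
```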
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/moe.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f'Expected {num_layers_per_pipeline_stage} tokens_per_expert '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+            # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
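
`indices_and_bins` above is the heart of the token permutation: it sorts the flattened expert assignments, histograms them to get per-expert token counts, and takes an inclusive cumsum to obtain bin boundaries into the sorted order. The same bookkeeping can be reproduced with stock PyTorch ops; a small sketch with made-up assignments, where `torch.sort`/`bincount`/`cumsum` stand in for the fused `sort`/`histogram`/`inclusive_cumsum` kernels:

```python
import torch

num_experts = 4
# Hypothetical flattened expert assignments for 3 tokens with top_k = 2.
top_experts = torch.tensor([2, 0, 1, 2, 0, 3])

# Equivalent of sort: bin_ids are the sorted expert ids, indices the permutation.
bin_ids, indices = torch.sort(top_experts)

# Equivalent of histogram: number of tokens routed to each expert.
tokens_per_expert = torch.bincount(top_experts, minlength=num_experts)

# Equivalent of inclusive_cumsum: bin boundaries into the sorted token array.
bins = torch.cumsum(tokens_per_expert, dim=0)

print(bin_ids)            # tensor([0, 0, 1, 2, 2, 3])
print(tokens_per_expert)  # tensor([2, 1, 2, 1])
print(bins)               # tensor([2, 3, 5, 6])
```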
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/mpu.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+        raise ValueError(
+            f"Invalid sharding. 'expert_sharding_degree' ({esd}) * 'hidden_sharding_degree' ({hsd}) != world_size ({world_size}).",
+        )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
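
The sharding helpers above split the expert-parallel world size into two factors: `expert_sharding_degree = min(world_size, num_experts)` and `hidden_sharding_degree = world_size // expert_sharding_degree`, with divisibility checks on both. A worked example with assumed numbers (no process group needed, the arithmetic is the whole point):

```python
# Assumed configuration, purely for illustration.
world_size = 8          # expert-parallel ranks
moe_num_experts = 4
ffn_hidden_size = 4096

esd = min(world_size, moe_num_experts)      # expert_sharding_degree -> 4
assert moe_num_experts % esd == 0
hsd = world_size // esd                     # hidden_sharding_degree -> 2
assert ffn_hidden_size % hsd == 0
assert esd * hsd == world_size

experts_per_rank = moe_num_experts // esd   # 1 expert per rank
features_per_rank = ffn_hidden_size // hsd  # 2048 features per rank
print(esd, hsd, experts_per_rank, features_per_rank)
```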
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/router.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
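
The router above is just a linear layer followed by a softmax, a top-k selection, and an optional re-normalization of the selected weights. A self-contained sketch of that routing math (hidden size, expert count, and top-k are made up; a plain sum stands in for the `p=1` normalization case):

```python
import torch

torch.manual_seed(0)
hidden_size, num_experts, top_k = 16, 8, 2
x = torch.randn(5, hidden_size)                      # 5 tokens
router = torch.nn.Linear(hidden_size, num_experts, bias=False)

logits = router(x)
scores = logits.softmax(dim=-1)                      # [tokens, num_experts]
expert_weights, expert_indices = torch.topk(scores, top_k, dim=-1)

# Optional: renormalize so the selected weights sum to 1 per token
# (equivalent to the p=1 norm when moe_normalize_expert_weights=1).
expert_weights = expert_weights / expert_weights.sum(dim=-1, keepdim=True)
print(expert_weights.sum(dim=-1))                    # ~1.0 for each token
```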
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+    """Returns a SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_megablocks_xpu_7a6bcf4.abi3.so b/build/torch210-cxx11-xpu20253-x86_64-linux/_megablocks_xpu_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..9ea39ea7a409dce7a5431c24e20fe754bbd42787
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_megablocks_xpu_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b22948998af13e0b921419366b6f68a1dd0e649e7ccb8c55c123c1aa9f3ec5b
+size 5381760
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c8dd6eeccd632df5e23111e5dd5221d3e1fcb47
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_xpu_7a6bcf4
+ops = torch.ops._megablocks_xpu_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_xpu_7a6bcf4::{op_name}"
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_version.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/backend/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/backend/kernels.py b/build/torch210-cxx11-xpu20253-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have CUDA.
+# This approach preserves the original code but enables testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
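
`padded_gather` sizes its output from `padded_bins[-1]`, which is why the comment above notes that the output size is dynamic. In the dMoE path the padded bins typically round each expert's token count up to a block-size multiple so the downstream block-sparse matmul sees full blocks; that rounding happens in the caller, not in this file, so the sketch below is only an assumed illustration of how `bins` and `padded_bins` relate:

```python
import torch

block_size = 128                                   # assumed sparse block size
tokens_per_expert = torch.tensor([5, 0, 200, 17])  # assumed per-expert counts

# Inclusive cumsum of the raw counts -> bin boundaries used by gather/scatter.
bins = torch.cumsum(tokens_per_expert, dim=0)

# Round each count up to a multiple of the block size, then cumsum -> padded bins.
padded = ((tokens_per_expert + block_size - 1) // block_size) * block_size
padded_bins = torch.cumsum(padded, dim=0)

print(bins)         # tensor([  5,   5, 205, 222])
print(padded_bins)  # tensor([128, 128, 384, 512])
# padded_bins[-1] (512 here) is the number of rows padded_gather allocates.
```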
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/benchmark_util.py b/build/torch210-cxx11-xpu20253-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
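
A short usage sketch for the two helpers above; the benchmarked function is an arbitrary matmul and a CUDA device is assumed, since timing is done with `torch.cuda.Event`:

```python
import torch

# Assumes benchmark_function and log_benchmark from above are in scope
# and that a CUDA device is available.
a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")

mean_ms, std_ms = benchmark_function(lambda: a @ b, iterations=50, warmup=5)
log_benchmark("matmul", {"m": 1024, "n": 1024, "k": 1024}, mean_ms, std_ms)
```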
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/cpu_fused_moe.py b/build/torch210-cxx11-xpu20253-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+ This implementation processes all experts in parallel using batched operations
+ instead of sequential for loops, which is more efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # Process experts one at a time: for each expert, find the
+    # (token_idx, topk_pos) pairs that were routed to it.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
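
An end-to-end CPU sketch that exercises `route_tokens_cpu` and `cpu_fused_moe` from the file above. All sizes are small made-up values, the weights are random, and the non-interleaved `[gate_all, up_all]` layout is used, so only the output shape is meaningful:

```python
import torch

torch.manual_seed(0)
tokens, hidden, inter, num_experts, top_k = 6, 32, 64, 4, 2

x = torch.randn(tokens, hidden)
router_weight = torch.randn(num_experts, hidden) * 0.02
w1 = torch.randn(num_experts, hidden, 2 * inter) * 0.02  # gate_up_proj
w2 = torch.randn(num_experts, inter, hidden) * 0.02      # down_proj

# Route tokens, then run the fused CPU MoE with a standard SwiGLU layout.
logits, weights, ids = route_tokens_cpu(x, router_weight, None, top_k, num_experts)
out = cpu_fused_moe(
    x, w1, w2, weights, ids,
    activation="silu", is_interleaved=False,
)
print(out.shape)  # torch.Size([6, 32])
```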
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/cpu_moe_cpp.py b/build/torch210-cxx11-xpu20253-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
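+# Minimal usage sketch for fused_moe_cpp (illustrative only; the shapes and
+# dtypes below are assumptions for a toy configuration, not values taken from
+# a specific model):
+#
+#   M, K, N, E, topk = 4, 64, 128, 8, 2
+#   hidden = torch.randn(M, K, dtype=torch.bfloat16)
+#   w1 = torch.randn(E, 2 * N, K, dtype=torch.bfloat16)   # fused gate/up projections
+#   w2 = torch.randn(E, K, N, dtype=torch.bfloat16)       # down projection
+#   weights, ids = torch.topk(torch.softmax(torch.rand(M, E), dim=-1), topk, dim=-1)
+#   out = fused_moe_cpp(hidden, w1, w2, weights, ids.to(torch.int32))
+#   # out has shape [M, K], matching hidden_states.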
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+        # mxfp4 is only enabled below when GPT-OSS-style precision configs are present.
+        if getattr(self, "use_mxfp4", None) is None:
+            self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+                # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3, or uint8 (mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "CPUMegaBlocksMoeMLP"]
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/grouped_gemm/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/grouped_gemm/backend.py b/build/torch210-cxx11-xpu20253-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
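+
+# Shape conventions for gmm, as implied by _allocate_output (sketch; the
+# variable names are illustrative):
+#   trans_a=False, trans_b=False: a is [tokens, K], b is [num_groups, K, N],
+#       batch_sizes holds the per-group row counts of 'a' (summing to tokens),
+#       and the result is [tokens, N].
+#   trans_a=False, trans_b=True:  b is [num_groups, N, K] and the result is
+#       again [tokens, N].
+#   trans_a=True:                 a is [tokens, K], b is [tokens, N], and the
+#       result is [num_groups, K, N] (the weight-gradient case).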
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/grouped_gemm/ops.py b/build/torch210-cxx11-xpu20253-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
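+
+# Autograd-aware usage sketch (illustrative shapes and device; not taken from
+# a real training setup):
+#
+#   a = torch.randn(6, 16, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+#   b = torch.randn(2, 16, 32, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+#   batch_sizes = torch.tensor([4, 2])   # rows of 'a' handled by each group
+#   out = gmm(a, b, batch_sizes)         # out: [6, 32]
+#   out.sum().backward()                 # gradients flow to 'a' and 'b' via GroupedGemm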
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/grouped_gemm_util.py b/build/torch210-cxx11-xpu20253-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored inside this package, so there is no external
+    # dependency to import here.
+    _grouped_gemm_is_available = True
+except ImportError:
+    warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+    msg = (
+        'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+    )
+    assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/layers.py b/build/torch210-cxx11-xpu20253-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
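+
+# Worked example of the sharding arithmetic above (assumed toy sizes): with
+# world_size = 8, moe_num_experts = 4 and ffn_hidden_size = 3072, the expert
+# sharding degree is min(8, 4) = 4, the hidden sharding degree is 8 // 4 = 2,
+# each rank owns 4 // 4 = 1 expert, and each expert's FFN is split into
+# 3072 // 2 = 1536 features per rank.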
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
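+
+# Routing sketch (shapes only): for x of shape [sl, bs, hs] the router yields
+# logits of shape [sl * bs, num_experts]; compute_top_k keeps the moe_top_k
+# largest logits per token and the softmax is taken over just those k values,
+# so expert_weights and expert_indices both have shape [sl * bs, moe_top_k].
+# When moe_normalize_expert_weights is set, the weights are further divided by
+# their p-norm along the last dimension.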
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
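+
+# Activation sketch for the block above: gate and up are interleaved along the
+# last dimension of the fused projection (even columns = gate, odd columns = up).
+# Per element, with the defaults alpha = 1.702 and limit = 7.0:
+#   g   = min(gate, limit)
+#   glu = g * sigmoid(alpha * g)
+#   out = (clamp(up, -limit, limit) + 1) * glu
+# i.e. the clamped "swigluoai" SwiGLU variant rather than a plain SiLU MLP.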
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+    if len(expert_scores) != num_layers_per_pipeline_stage:
+        raise ValueError(
+            f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+            f"{args.num_layers}\npipeline_model_parallel_size = "
+            f"{args.pipeline_model_parallel_size}\n"
+            "num_layers_per_virtual_pipeline_stage"
+            f" = {args.num_layers_per_virtual_pipeline_stage}",
+        )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
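+
+# Worked example for load_balancing_loss (assumed toy numbers): with
+# num_experts = 4, top_k = 2 and tokens = 8 there are 16 assignments, so a
+# perfectly balanced router gives tokens_per_expert = [4, 4, 4, 4] and mean
+# expert_scores of 0.25 per expert. The loss is then
+#   4 / (8 * 2) * dot([4, 4, 4, 4], [0.25, 0.25, 0.25, 0.25]) = 0.25 * 4 = 1.0,
+# and it grows as routing collapses onto fewer experts.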
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
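+
+# Toy example of the tensors produced above (num_experts = 4):
+#   top_expert        = [2, 0, 2, 1]            # expert id per assignment
+#   bin_ids, indices  = sort(top_expert)        # -> [0, 1, 2, 2], [1, 3, 0, 2]
+#   tokens_per_expert = histogram(top_expert)   # -> [1, 1, 2, 0]
+#   bins              = inclusive_cumsum(...)   # -> [1, 2, 4, 4]
+# so bins[e] is the end offset of expert e's contiguous slice in sorted order.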
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
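+
+# Capacity arithmetic sketch (assumed toy numbers): with tokens = 1024,
+# top_k = 4, num_experts = 128, a single device (world_size = 1) and
+# moe_capacity_factor = 1.0, each expert is sized for
+# int(1.0 * 4 * 1024 * 1 / 128) = 32 token slots; a capacity factor of 1.25
+# raises that to 40 slots per expert.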
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Ensure CUB knows which device to use
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/megablocks/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/metadata.json b/build/torch210-cxx11-xpu20253-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..b911d0a2549a35a1c65ab7e77d32e5aac23cd6ac
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/metadata.json
@@ -0,0 +1,8 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "xpu"
+ }
+}
\ No newline at end of file
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per fp16 element.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/binned_gather.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/binned_scatter.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/cumsum.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/gather.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
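+ # Permute the rows of 'x' into contiguous per-expert ranges described by 'bins'.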
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/histogram.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrapped in a try-block to give a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
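+ # Bucket counts for the integer values in 'x' over the range [0, max_val).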
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/histogram_benchmark.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
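+ # Wait for the GPU so the recorded events give a valid elapsed time.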
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/matmul_benchmark.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid the overhead
+# it adds.
+def transpose_view(x):
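+ # Equivalent to x.t() for a 2D tensor, without the transpose/as_strided dispatch.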
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/padded_gather.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
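+ # Like gather, but each expert's range is padded up to a block boundary ('padded_bins').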
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/padded_scatter.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
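+ # Only save 'x' when the gradient w.r.t. 'weights' will be needed.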
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/permute_benchmark.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/repeat.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
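+ # A tiling of all ones is a no-op; skip the copy that Tensor.repeat would make.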
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/replicate.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrapped in a try-block to give a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
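+ # Replicate each value of 'x' across the span of its bin to fill 'num_outputs' columns.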
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/round_up.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+ # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
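+ # Example (illustrative): round_up(torch.tensor([3, 130], dtype=torch.int32), 128)
+ # -> tensor([128, 256], dtype=torch.int32).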
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/scatter.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
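+ # Save 'x' for backward only if a gradient w.r.t. 'weights' is required.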
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/sort.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrapped in a try-block to give a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
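+ # Radix sort 'x' and also return the permutation ('iota_out') that produced the order.
+ # 'end_bit' bounds how many low bits are sorted, defaulting to the full dtype width.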
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/sort_benchmark.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/stk_autocast.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+ elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
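+ # Run the backward pass with autocast disabled so it sees the dtypes chosen in forward.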
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/sum.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
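+ # Summing over a singleton dimension is just a squeeze; avoid the reduction.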
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/ops/topology.py b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrapped in a try-block to give a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
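+ # ops.indices fills 'out' with the column index of each nonzero block in the padded block-sparse topology.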
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/backend/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/backend/autocast.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+ elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/backend/sputnik.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
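+ # A tensor is treated as transposed when it is stored column-major (unit stride along dim 0).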
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
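+# DSD: sparse lhs x dense rhs -> dense output.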
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
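+# DDS: dense lhs x sparse rhs -> dense output.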
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
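+# SDD: dense lhs x dense rhs -> sparse output with the topology of 'data'.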
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/backend/triton_kernels.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ # Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
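+
+# Illustrative sketch (assumed toy values, not part of the kernel contract): for
+# offsets = [0, 2, 3, 3, 5] the launcher starts len(offsets) - 1 = 4 programs, one
+# per block row, and each program writes its row id once per nonzero block, giving
+# row_indices = [0, 0, 1, 3, 3]. `out` must therefore be preallocated with
+# offsets[-1] entries (one per nonzero block).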
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/matrix.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D data.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
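+# Illustrative sketch (assumed toy inputs): with row_indices = [0, 0, 1] and
+# column_indices = [0, 1, 1] on a 2x2 block grid, argsort(column_indices) is
+# [0, 1, 2], so column_indices_t = [0, 0, 1] (the original row ids grouped by
+# column), the per-column histogram is [1, 2] giving offsets_t = [0, 1, 3], and
+# block_offsets_t = [0, 1, 2] records where each transposed block lives in the
+# untransposed data array.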
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+                    f"Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
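+
+# Minimal usage sketch (illustrative; assumes the package is importable as `stk`,
+# as in the tests in this tree):
+#
+#   a = stk.random.randn((128, 128), 0.5, blocking=16)
+#   b = stk.ops.ones_like(a)     # shares a's topology by construction
+#   c = stk.ops.mul(a, b)        # entrywise product, same topology as a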
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/linear_ops.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
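+
+# Usage sketch (illustrative; `topo` is any stk.Matrix carrying the desired
+# output sparsity pattern):
+#
+#   y = dsd(a_sparse, b_dense)        # dense = sparse @ dense
+#   y = dds(a_dense, b_sparse)        # dense = dense @ sparse
+#   s = sdd(a_dense, b_dense, topo)   # sparse = (dense @ dense) on topo's pattern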
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/matrix_ops.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler than its current implementation.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
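+
+# Round-trip sketch (illustrative, mirroring the format-conversion test in this
+# tree; `stk.random.dense_mask` is used only to build a block-sparse pattern):
+#
+#   x = (torch.randn(128, 128) * stk.random.dense_mask(128, 128, 0.5, 16)).half()
+#   sp = to_sparse(x, blocking=16)        # BCSR Matrix with 16x16 blocks
+#   assert torch.equal(to_dense(sp), x)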
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/random/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/random/random_ops.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
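+
+# Worked example (assumed numbers): dense_mask(8, 8, sparsity=0.75, blocking=4)
+# partitions the matrix into four 4x4 blocks and keeps round(4 * 0.25) = 1 of
+# them, so exactly 16 of the 64 entries are 1.0. mask() converts that pattern to
+# a BCSR Matrix, and randn() additionally fills the retained blocks with normal
+# noise.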
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/stk/random/random_ops_test.py b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from . import random_ops as random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/xpu_fused_moe.py b/build/torch210-cxx11-xpu20253-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# default
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
+
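+# Worked example (assumed numbers): with num_tokens=1000 and 8 experts per node,
+# 32 and 64 are rejected (ceilDiv(1000, 32) * 8 = 256 > 32; ceilDiv(1000, 64) * 8
+# = 128 > 64), while 128 satisfies ceilDiv(1000, 128) * 8 = 64 <= 128, so 128 is
+# returned.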
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
+
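+# Example (assumed buffer): reinterpreting an 8-byte uint8 workspace slice with
+# dtype=torch.int64 yields a 1-element tensor. The byte copy produces a
+# standalone tensor rather than a view that aliases the workspace.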
+
+def implement_zp(qweight):
+ # change u4 to s4 to avoid zero point in gemm kernel
+ # only support default zero point now
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
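+# Illustrative example (single assumed byte): qweight = 0x9F packs the unsigned
+# nibbles 9 (high) and 15 (low); subtracting 8 gives the signed values 1 and 7,
+# which pack_compact re-encodes as a sign bit followed by the low three bits of
+# each value, producing the output byte 0x17.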
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ function. Temporarily exposed here before gemm fusion.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
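+# Call sketch (hypothetical shapes; bf16 weights, no quantization, single rank):
+#
+#   x = torch.randn(num_tokens, hidden_size, dtype=torch.bfloat16, device="xpu")
+#   out = xpu_fused_moe(x, w13, None, None, w2, None, None,
+#                       topk_weights.float(), topk_ids,
+#                       n_experts_per_token=top_k, activation="silu",
+#                       num_experts=num_experts)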
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
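+# Shape sketch (assumed sizes): for x of shape [T, H], router_weight of shape
+# [E, H] and moe_top_k = 4, logits is [T, E], expert_weights is the softmax over
+# the four selected logits per token ([T, 4]), and expert_indices is the matching
+# [T, 4] integer tensor consumed by xpu_fused_moe above.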
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/__init__.py b/build/torch211-cxx11-cpu-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
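+# Worked example (assumed values): for x = [1, 2, 3] the inclusive cumsum is
+# [1, 3, 6], while the exclusive variant above yields [0, 1, 3]; both copy the
+# result into the caller-provided `out` tensor and return it.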
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
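The cumsum wrappers above delegate to the compiled ops; for reference, the expected semantics can be reproduced with plain PyTorch. This is an illustrative sketch only (the `torch_*` helpers are made-up names, not part of this package):

```python
import torch


def torch_inclusive_cumsum(x: torch.Tensor, dim: int = -1) -> torch.Tensor:
    return torch.cumsum(x, dim=dim)


def torch_exclusive_cumsum(x: torch.Tensor, dim: int = -1) -> torch.Tensor:
    # The exclusive sum at position i is the inclusive sum minus x[i].
    return torch.cumsum(x, dim=dim) - x


if __name__ == "__main__":
    t = torch.tensor([2, 3, 5])
    print(torch_inclusive_cumsum(t))  # tensor([ 2,  5, 10])
    print(torch_exclusive_cumsum(t))  # tensor([0, 2, 5])
```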
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/__init__.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/activation_fn.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
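`act_fn` applies the activation to the sparse matrix's `.data` and can hand back `out.backward` for later gradient rematerialization. A dense-tensor sketch of that same pattern, with illustrative names only:

```python
import torch
import torch.nn.functional as F


def act_with_grad_fn(x: torch.Tensor, fn=F.gelu, return_grad_fn: bool = False):
    # Run the activation with grad enabled so the caller can replay backward later.
    with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
        if return_grad_fn:
            x.requires_grad_(True)
        out = fn(x)
    if return_grad_fn:
        return out, out.backward
    return out


if __name__ == "__main__":
    x = torch.randn(4, 4)
    y, grad_fn = act_with_grad_fn(x, return_grad_fn=True)
    grad_fn(torch.ones_like(y))  # populates x.grad without keeping extra state
    print(x.grad.shape)
```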
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/all_to_all.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
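`all_to_all` relies on matching `output_split_sizes`/`input_split_sizes` on every rank: what rank r receives from rank d is exactly what d sends to r. A pure-Python sketch of that bookkeeping (no process group required; `output_split_sizes_for` is a made-up helper, the real exchange is done by `dist.all_to_all_single`):

```python
def output_split_sizes_for(rank: int, input_split_sizes_per_rank: list[list[int]]) -> list[int]:
    # Rank `rank` receives sizes[d][rank] items from every source rank d,
    # i.e. the transposed column of the size matrix.
    return [row[rank] for row in input_split_sizes_per_rank]


if __name__ == "__main__":
    # input_split_sizes_per_rank[r][d] = tokens rank r sends to rank d
    sizes = [
        [2, 1, 0],  # rank 0 sends 2/1/0 tokens to ranks 0/1/2
        [0, 3, 1],  # rank 1
        [4, 0, 2],  # rank 2
    ]
    for r in range(3):
        print(r, output_split_sizes_for(r, sizes))
    # rank 0 receives [2, 0, 4], rank 1 receives [1, 3, 0], rank 2 receives [0, 1, 2]
```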
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/arguments.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in shared expert (purpose: to allow using custom FC layer eg te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
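`from_megatron` copies any attribute whose name matches an `Arguments` field. A self-contained sketch of the same field-copy pattern, using a made-up `TrainerConfig` stand-in so it runs without CUDA or grouped-GEMM dependencies:

```python
import dataclasses
from types import SimpleNamespace


@dataclasses.dataclass
class TrainerConfig:
    hidden_size: int = 1024
    ffn_hidden_size: int = 4096
    moe_num_experts: int = 1
    moe_top_k: int = 1


def copy_matching_fields(cfg_cls, external):
    # Start from the defaults, then overwrite any field the external args define.
    cfg = cfg_cls()
    for field in dataclasses.fields(cfg):
        if hasattr(external, field.name):
            setattr(cfg, field.name, getattr(external, field.name))
    return cfg


if __name__ == "__main__":
    megatron_like = SimpleNamespace(hidden_size=2048, moe_top_k=2, unrelated_flag=True)
    print(copy_matching_fields(TrainerConfig, megatron_like))
    # TrainerConfig(hidden_size=2048, ffn_hidden_size=4096, moe_num_experts=1, moe_top_k=2)
```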
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/common.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/dmlp_registry.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
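A toy version of the two-level `{mlp_type: {mlp_impl: class}}` lookup and its error handling, with plain callables standing in for the MLP/GLU classes (names are illustrative only):

```python
from typing import Callable, Dict


def sparse_stub(width: int) -> str:
    return f"sparse_mlp(width={width})"


def grouped_stub(width: int) -> str:
    return f"grouped_mlp(width={width})"


_TOY_REGISTRY: Dict[str, Dict[str, Callable[[int], str]]] = {
    "mlp": {"sparse": sparse_stub, "grouped": grouped_stub},
}


def build_layer(mlp_type: str, mlp_impl: str, width: int) -> str:
    if mlp_type not in _TOY_REGISTRY:
        raise ValueError(f"Unsupported mlp type: {mlp_type}")
    if mlp_impl not in _TOY_REGISTRY[mlp_type]:
        raise ValueError(f"{mlp_type} does not support {mlp_impl} backend.")
    return _TOY_REGISTRY[mlp_type][mlp_impl](width)


if __name__ == "__main__":
    print(build_layer("mlp", "grouped", 4096))  # grouped_mlp(width=4096)
```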
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/dmoe.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+ # A blocks offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
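The padded-bin bookkeeping in `indices_and_padded_bins` can be reproduced with stock PyTorch ops, which is handy for sanity-checking kernel outputs. A CPU-only reference sketch (the helper name is made up; this is not the code path used above):

```python
import torch


def padded_bins_reference(top_experts: torch.Tensor, num_experts: int, blocking: int):
    # Sort tokens by expert id (stand-in for ops.sort) and count assignments.
    bin_ids, indices = torch.sort(top_experts)
    tokens_per_expert = torch.bincount(top_experts, minlength=num_experts)
    # Round each expert's count up to the block size (stand-in for ops.round_up),
    # then take inclusive cumsums to get bin boundaries.
    padded = ((tokens_per_expert + blocking - 1) // blocking) * blocking
    padded_bins = torch.cumsum(padded, dim=0)
    bins = torch.cumsum(tokens_per_expert, dim=0)
    return indices, bin_ids, bins, padded_bins, tokens_per_expert


if __name__ == "__main__":
    top = torch.tensor([1, 0, 3, 1, 1, 0])  # 6 tokens routed across 4 experts
    indices, bin_ids, bins, padded_bins, counts = padded_bins_reference(top, 4, blocking=4)
    print(counts)       # tensor([2, 3, 0, 1])
    print(bins)         # tensor([2, 5, 5, 6])
    print(padded_bins)  # tensor([ 4,  8,  8, 12])
```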
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/gelu.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
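The constants in `_gelu_backward_inplace` are a closed-form derivative of the tanh-approximated GELU. A quick dense-tensor check against autograd (pure PyTorch; nothing here touches the sparse `Matrix` wrapper):

```python
import torch
import torch.nn.functional as F


def gelu_tanh_grad(x: torch.Tensor) -> torch.Tensor:
    # Same closed form as _gelu_backward_inplace, without the in-place multiply.
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    return 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)


if __name__ == "__main__":
    x = torch.randn(1024, dtype=torch.float64, requires_grad=True)
    F.gelu(x, approximate="tanh").sum().backward()
    print(torch.allclose(x.grad, gelu_tanh_grad(x.detach()), atol=1e-6))  # True
```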
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/glu.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
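Per expert, the grouped GLU computes `act(x @ w1.T) * (x @ v1.T)` and projects the result back through `w2`. A dense single-expert reference sketch of that computation (illustrative only, not the grouped-GEMM path):

```python
import torch
import torch.nn.functional as F


def glu_expert_reference(x, w1, v1, w2, activation=lambda t: F.gelu(t, approximate="tanh")):
    # x: [tokens, hidden]; w1, v1, w2: [ffn, hidden], matching the grouped layout.
    x1 = x @ w1.t()                    # gate branch
    x2 = x @ v1.t()                    # linear branch
    return (activation(x1) * x2) @ w2  # back to [tokens, hidden]


if __name__ == "__main__":
    tokens, hidden, ffn = 8, 16, 32
    x = torch.randn(tokens, hidden)
    w1, v1, w2 = (torch.randn(ffn, hidden) for _ in range(3))
    print(glu_expert_reference(x, w1, v1, w2).shape)  # torch.Size([8, 16])
```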
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/memory_test.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MiB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
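The `get_tensors()` sweep enumerates live tensors via the garbage collector and de-duplicates them by data pointer. A CPU-friendly sketch of the same audit (the helper name is hypothetical):

```python
import gc

import torch


def live_tensor_bytes() -> int:
    # Walk the GC heap, skip duplicate views of the same storage, and sum sizes.
    seen, total = set(), 0
    for obj in gc.get_objects():
        if torch.is_tensor(obj) and obj.is_contiguous():
            ptr = obj.data_ptr()
            if ptr in seen:
                continue
            seen.add(ptr)
            total += obj.numel() * obj.element_size()
    return total


if __name__ == "__main__":
    # Hold references so these tensors stay alive during the sweep.
    keep = [torch.randn(1024, 1024), torch.zeros(256, dtype=torch.int64)]
    print(f"{live_tensor_bytes() / 1e6:.1f} MB of live tensors")
```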
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/mlp.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+    # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+ # wieghted by number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
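`ScaleGradient` is an identity in the forward pass but scales gradients by `1 / expert_parallel_world_size` in the backward pass. A stand-alone check of that behaviour (a minimal re-implementation for illustration, not imported from this package):

```python
import torch


class ScaleGrad(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, scale):
        ctx.scale = scale
        return x  # identity forward

    @staticmethod
    def backward(ctx, grad):
        return grad * ctx.scale, None  # scaled gradient, no grad for `scale`


if __name__ == "__main__":
    world_size = 4
    w = torch.randn(3, 3, requires_grad=True)
    (ScaleGrad.apply(w, 1 / world_size) * 2.0).sum().backward()
    print(w.grad)  # every entry is 2 / world_size == 0.5
```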
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/moe.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f'Expected {num_layers_per_pipeline_stage} tokens_per_expert entries '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+            # If the computed expert capacity is zero, fall back to the maximum
+            # number of tokens routed to any single expert so that no tokens are dropped.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+        # Replicate the tokens so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+            # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+            # If the computed expert capacity is zero, fall back to the maximum
+            # number of tokens routed to any single expert so that no tokens are dropped.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+        # Reduce along the hidden sharding dimension to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/mpu.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
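+
+
+# Worked example (hypothetical configuration): with an expert-parallel world
+# size of 8 and moe_num_experts=4, expert_sharding_degree() is min(8, 4) = 4,
+# hidden_sharding_degree() is 8 // 4 = 2 and experts_per_rank() is 4 // 4 = 1;
+# for ffn_hidden_size=4096, features_per_rank() is 4096 // 2 = 2048.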
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/router.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
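+# batched_router_zloss: each entry of the returned vector is moe_zloss_weight
+# times the mean over tokens of (logsumexp over experts of that router's
+# logits) squared, i.e. the router z-loss for one router.
+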
+
+# NOTE: To enable end-to-end benchmarking without requiring convergence, we
+# support a flag that forces the router to assign tokens uniformly across
+# the experts. We do this with a custom autograd operation so that PyTorch
+# still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch211-cxx11-cpu-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_megablocks_cpu_7a6bcf4.abi3.so b/build/torch211-cxx11-cpu-x86_64-linux/_megablocks_cpu_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..4fe96907856dba6b076db26ec4f8522939171a26
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_megablocks_cpu_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fe559f32ea12ed42966ea79e77aa2ea0c7bfb5e123e84ac526fc5d94cf6b9a3
+size 2219080
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_ops.py b/build/torch211-cxx11-cpu-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..102573e975ffc7897e0e9c4edca028ed1dc67419
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cpu_7a6bcf4
+ops = torch.ops._megablocks_cpu_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cpu_7a6bcf4::{op_name}"
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/_version.py b/build/torch211-cxx11-cpu-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/backend/__init__.py b/build/torch211-cxx11-cpu-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/backend/kernels.py b/build/torch211-cxx11-cpu-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton autotune when running in an environment without CUDA.
+# This preserves the original code while still allowing tests without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+    # NOTE: There is no padding, so the number of output rows equals the
+    # number of input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per routed token copy (one element of 'wgrad'). Array
+    # 'x' has a greater or equal number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
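+# Round-trip sketch (illustrative): binned_gather packs a [tokens, hidden]
+# activation into a dense [num_experts, expert_capacity, hidden] buffer using
+# the bin boundaries, the expert MLP runs on that buffer, and binned_scatter
+# maps the results back to [tokens, hidden], scaling each copy by its routing
+# weight and summing over the top_k copies.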
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/benchmark_util.py b/build/torch211-cxx11-cpu-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
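+
+
+# Usage sketch (illustrative; timing uses CUDA events, so a GPU is required):
+#   a = torch.randn(4096, 4096, device='cuda')
+#   mean_ms, std_ms = benchmark_function(lambda: a @ a)
+#   log_benchmark('MatMul', {'n': 4096}, mean_ms, std_ms)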
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/cpu_fused_moe.py b/build/torch211-cxx11-cpu-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
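+# Illustrative values: for scalar inputs gate=2.0 and up=1.0 with the default
+# alpha/limit, glu = 2.0 * sigmoid(2.0 * 1.702) ~= 1.94 and the result is
+# (1.0 + 1) * glu ~= 3.87.
+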
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
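+# Shape sketch (illustrative): routing 6 tokens of hidden size 16 across 4
+# experts with moe_top_k=2 returns logits of shape [6, 4] and expert_weights /
+# expert_indices of shape [6, 2].
+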
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+ This implementation processes all experts in parallel using batched operations
+ instead of sequential for loops, which is more efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+ # Build expert mask: which tokens go to which expert
+ # expert_mask[expert_id] contains indices of (token_idx, topk_pos) pairs
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
+
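+# Usage sketch (illustrative shapes, not a prescribed API):
+#   x = torch.randn(6, 16)                   # 6 tokens, hidden size 16
+#   w1 = torch.randn(4, 16, 64)              # 4 experts, gate_up to 2 * 32
+#   w2 = torch.randn(4, 32, 16)              # down projection back to 16
+#   _, tw, ti = route_tokens_cpu(x, torch.randn(4, 16), None, 2, 4)
+#   y = cpu_fused_moe(x, w1, w2, tw, ti)     # -> [6, 16]
+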
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py b/build/torch211-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
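+# Usage sketch (illustrative): with weights and routing produced as in
+# cpu_fused_moe, a plain bf16 call looks like
+#   y = fused_moe_cpp(x, w1, w2, topk_weights, topk_ids.to(torch.int32))
+# which takes the default non-quantized path; passing alpha and limit selects
+# the swigluoai activation instead of silu_and_mul.
+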
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP"]
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/grouped_gemm/__init__.py b/build/torch211-cxx11-cpu-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/grouped_gemm/backend.py b/build/torch211-cxx11-cpu-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/grouped_gemm/ops.py b/build/torch211-cxx11-cpu-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
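+
+
+# A rough usage sketch for the wrapper above (illustrative only: shapes are
+# assumed, and dtype/device requirements are whatever the underlying grouped
+# GEMM backend expects):
+#
+#   a = torch.randn(6, 16, device="cuda")        # 6 tokens, hidden size 16
+#   b = torch.randn(3, 16, 32, device="cuda")    # 3 experts, 16 -> 32
+#   batch_sizes = torch.tensor([2, 1, 3])        # tokens per expert, sums to 6
+#   out = gmm(a, b, batch_sizes)                 # -> shape (6, 32)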
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/grouped_gemm_util.py b/build/torch211-cxx11-cpu-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored in this repository, so there is no external
+    # package to import; mark it as available.
+    _grouped_gemm_is_available = True
+except ImportError:
+    warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/layers.py b/build/torch211-cxx11-cpu-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Calculate the expert sharding degree based on world size and number of experts
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
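+
+
+# A rough worked example for the sharding helpers above (assumed values, for
+# illustration only): with world_size=8 and moe_num_experts=4,
+# expert_sharding_degree() == 4 and experts_per_rank() == 1, so
+# hidden_sharding_degree() == 8 // 4 == 2 and an ffn_hidden_size of 3072 is
+# split into features_per_rank() == 1536 features per rank.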
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+    router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
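+
+# Shape sketch for route_tokens above (illustrative, assuming x is [sl, bs, hs]):
+#   logits:         [sl * bs, moe_num_experts]
+#   expert_weights: [sl * bs, moe_top_k]  (softmax over the selected top-k logits)
+#   expert_indices: [sl * bs, moe_top_k]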
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
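+
+# Note on the activation above (a description of the code, with the default
+# alpha=1.702 and limit=7.0): w1's output is de-interleaved into gate/up halves,
+# gate is clamped from above at `limit`, up is clamped to [-limit, limit], and
+# the result fed to w2 is (up + 1) * gate * sigmoid(alpha * gate).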
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
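+
+# Example weighting (illustrative): with shared_expert_weighted_sum=True and
+# moe_top_k=3, the shared expert contributes 1/4 of the combined output and the
+# routed experts contribute 3/4; otherwise the two outputs are simply summed.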
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+    expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
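+
+# Worked example for the loss above (assumed numbers, illustration only):
+# tokens=8, top_k=2, num_experts=4, tokens_per_expert=[4, 4, 4, 4] and a
+# perfectly uniform router (mean score 0.25 per expert) give
+#   scale = 4 / (8 * 2) = 0.25 and loss = 0.25 * (4 * 0.25 * 4) = 1.0,
+# the value produced by a perfectly balanced assignment.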
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
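+
+# Illustrative output of indices_and_bins (assuming a stable sort): for
+# top_expert = [1, 0, 1, 2] and num_experts = 3,
+#   bin_ids           == [0, 1, 1, 2]   (expert id of each token, sorted by expert)
+#   indices           == [1, 0, 2, 3]   (original position of each sorted token)
+#   tokens_per_expert == [1, 2, 1]
+#   bins              == [1, 3, 4]      (inclusive cumsum of the counts)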
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
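+
+# Rough numeric example (assumed values): tokens=4096, top_k=4, num_experts=128,
+# a single rank and moe_capacity_factor=1.0 give
+# tokens_per_expert = 4 * 4096 * 1 / 128 = 128, so the capacity is 128 slots
+# per expert.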
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+        # Start the token-count exchange without blocking
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
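+
+    # Rough usage sketch (illustrative only; shapes, the `hidden` size, and the
+    # initialization are assumptions, not part of this module's API):
+    #   layer = MegaBlocksMoeMLPWithSharedExpert()
+    #   ... set up layer.router / layer.experts as for MegaBlocksMoeMLP ...
+    #   layer.set_shared_expert_weights(
+    #       up_proj_weight=torch.randn(4 * hidden, hidden, device="cuda") * 0.02,
+    #       down_proj_weight=torch.randn(hidden, 4 * hidden, device="cuda") * 0.02,
+    #       weighted_sum=True,
+    #   )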
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/megablocks/__init__.py b/build/torch211-cxx11-cpu-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/metadata.json b/build/torch211-cxx11-cpu-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..eb22148b3f551be150f7824a5684c19bbc40ae0e
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/metadata.json
@@ -0,0 +1,8 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cpu"
+ }
+}
\ No newline at end of file
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/__init__.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per fp16 element.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/binned_gather.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/binned_scatter.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/cumsum.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+    raise ModuleNotFoundError("megablocks C++ ops extension ('megablocks._ops') is not available.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
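+
+
+# Semantics sketch (illustrative): for a 1D input x = [1, 2, 1],
+# inclusive_cumsum(x, 0) -> [1, 3, 4] and exclusive_cumsum(x, 0) -> [0, 1, 3].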
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/gather.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/histogram.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+    raise ModuleNotFoundError("megablocks C++ ops extension ('megablocks._ops') is not available.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/histogram_benchmark.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/matmul_benchmark.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/padded_gather.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/padded_scatter.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
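+        # Save the (potentially large) activations only when a gradient for
+        # the routing weights (forward input index 3) will be required.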
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/permute_benchmark.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/repeat.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/replicate.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/round_up.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
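+    # For example, round_up(torch.tensor([5, 128, 129], dtype=torch.int32), 128)
+    # yields tensor([128, 128, 256], dtype=torch.int32).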
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/scatter.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/sort.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
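+        # end_bit defaults to the full bit width of x's dtype; callers that
+        # know the maximum value can pass fewer bits (see sort_benchmark.py).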
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/sort_benchmark.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/stk_autocast.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
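+# Matching decorator for the backward pass: it runs the backward body with
+# autocast disabled so saved tensors and incoming gradients are used with
+# their existing dtypes rather than being re-cast.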
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/sum.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
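+    # When the reduced dimension has a single element, squeezing gives the
+    # same result as the reduction but returns a view instead of computing a sum.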
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/ops/topology.py b/build/torch211-cxx11-cpu-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/__init__.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/backend/__init__.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/backend/autocast.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/backend/sputnik.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/backend/triton_kernels.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ #Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
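+# For example, offsets = [0, 2, 3, 6] produces out = [0, 0, 1, 2, 2, 2]:
+# each nonzero block gets the index of the block row it belongs to.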
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/matrix.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D data.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
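+    # For example, row_indices = [0, 1, 1] and column_indices = [2, 0, 1]
+    # give gather_indices = [1, 2, 0] and column_indices_t = [1, 1, 0].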
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
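+    For an (R, C) matrix with square blocks of size B, `data` holds the
+    nonzero blocks as an (nnz, B, B) tensor, `row_indices` and
+    `column_indices` give each block's block row and block column, and
+    `offsets` (length R // B + 1) marks where each block row starts in
+    `data`.
+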
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+                    f"Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
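+
+
+# Illustrative sketch (added for clarity, not part of the original source): how the
+# BCSR fields relate to a dense layout. For an 8x8 matrix with blocking=4 whose
+# nonzero blocks sit at block coordinates (0, 0), (0, 1) and (1, 1):
+#
+#   data:           shape [3, 4, 4]   # one [blocking, blocking] tile per nonzero block
+#   row_indices:    [0, 0, 1]         # block-row of each tile
+#   column_indices: [0, 1, 1]         # block-column of each tile
+#   offsets:        [0, 2, 3]         # CSR-style offsets over block-rows
+#
+# The *_t fields (column_indices_t, offsets_t, block_offsets_t) store the same
+# topology for the transposed view, which is why t() can flip a flag instead of
+# shuffling data.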
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/__init__.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
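+
+
+# Minimal usage sketch (illustrative, not part of the original source). Both operands
+# must share the same block topology; reusing a's metadata, as the tests do in
+# _dense_and_sparse_like, guarantees that:
+#
+#   a = stk.random.randn((256, 256), sparsity=0.5, blocking=128).cuda()
+#   b = stk.Matrix(a.size(), torch.ones_like(a.data), a.row_indices,
+#                  a.column_indices, a.offsets)
+#   c = mul(a, b)  # entries match torch.mul on the dense equivalents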
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/linear_ops.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
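+
+
+# Naming note (added for clarity, not part of the original source): each letter
+# describes an operand in output/lhs/rhs order, with "d" = dense and "s" = sparse.
+# So dsd computes a dense output from sparse @ dense, dds computes dense @ sparse,
+# and sdd produces a sparse output with topo's topology from two dense inputs.
+# A rough usage sketch (a, b, w are assumed dense CUDA tensors):
+#
+#   topo = stk.random.mask(m, n, sparsity=0.8, blocking=128).cuda()
+#   y = sdd(a, b, topo)   # sparse [m, n] output restricted to topo's blocks
+#   z = dsd(y, w)         # dense output, sparse lhs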
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
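+
+
+# Round-trip sketch (illustrative, not part of the original source): to_sparse keeps
+# only blocks containing at least one nonzero, so converting back with to_dense
+# reproduces a block-sparse input exactly:
+#
+#   m = stk.random.dense_mask(256, 256, sparsity=0.5, blocking=128)
+#   x = (torch.randn(256, 256) * m).half()
+#   assert torch.equal(to_dense(to_sparse(x, blocking=128)), x)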
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/random/__init__.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/random/random_ops.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
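+
+
+# Sizing note (added for clarity, not part of the original source): sparsity is
+# applied at block granularity, so the number of nonzero scalars is
+# round(block_rows * block_cols * (1 - sparsity)) * blocking ** 2. For example,
+# dense_mask(128, 256, sparsity=0.5, blocking=16) keeps round(8 * 16 * 0.5) = 64
+# blocks, i.e. 64 * 16 * 16 = 16384 nonzero entries.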
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/stk/random/random_ops_test.py b/build/torch211-cxx11-cpu-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from . import random_ops as random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cpu-x86_64-linux/xpu_fused_moe.py b/build/torch211-cxx11-cpu-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch211-cxx11-cpu-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# default
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+        for x in arr:
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
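+
+
+# Example (illustrative, not part of the original source): an 8-byte uint8 buffer
+# reinterpreted as int32 yields a 2-element tensor; this is equivalent to
+# buf.view(torch.int32), just via an explicit byte copy:
+#
+#   buf = torch.zeros(8, dtype=torch.uint8)
+#   _bytes_to_typed_tensor(buf, torch.int32)  # -> tensor([0, 0], dtype=torch.int32)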
+
+
+def implement_zp(qweight):
+ # change u4 to s4 to avoid zero point in gemm kernel
+ # only support default zero point now
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
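+
+
+# Worked example (added for clarity, not part of the original source): for the input
+# byte 0x3D the high/low u4 nibbles are 3 and 13; subtracting the zero point of 8
+# gives signed values -5 and 5, which re-pack (sign bit plus the low 3 bits of the
+# two's complement) to nibbles 0xB and 0x5, so the output byte is 0xB5.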
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ func; temporarily exposed before gemm fusion.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
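+
+
+# Shape sketch (added for clarity, not part of the original source): with x of shape
+# [tokens, hidden] and router_weight of shape [num_experts, hidden], logits is
+# [tokens, num_experts] while expert_weights and expert_indices are both
+# [tokens, moe_top_k]. Note the softmax is taken over the selected top-k logits
+# only, not over all experts.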
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
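+
+
+# Example (illustrative, not part of the original source): for x = [1, 2, 3],
+# cumsum(x) returns [1, 3, 6] while cumsum(x, exclusive=True) returns [0, 1, 3].
+# The exclusive form is the usual way to turn per-expert token counts into
+# first-token offsets, mirroring expert_first_token_offset elsewhere in this repo.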
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/activation_fn.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/all_to_all.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/arguments.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in shared expert (purpose: to allow using a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/common.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/dmlp_registry.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
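+
+
+# For example, Arguments(mlp_type='glu', mlp_impl='grouped') resolves to
+# glu.GroupedGLU, while the defaults (mlp_type='mlp', mlp_impl='sparse')
+# resolve to mlp.SparseMLP.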
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/dmoe.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
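+    # An inclusive cumsum over a single-element tensor can come back 0-dim;
+    # promote it to 1-D so downstream indexing and concatenation work.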
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
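+        # For example (hypothetical sizes): with ffn_hidden_size=4096,
+        # num_experts=8 and blocking=128, max_column_index is 256 and the
+        # transpose sort uses 8 bits.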
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
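+        # For instance (hypothetical sizes): with 256 padded tokens,
+        # blocking=128 and ffn_hidden_size=4096 there are 2 block rows of
+        # 32 blocks each, so offsets is [0, 32, 64].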
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
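+        # Example (hypothetical counts): tokens_per_expert=[3, 0, 5] with
+        # blocking=128 rounds to [128, 0, 128], giving padded_bins=[128, 128, 256].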
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/gelu.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
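+    # Derivative of the tanh-approximated GELU, applied in place to the
+    # incoming gradient g: 0.79788456 is sqrt(2/pi) and
+    # 0.1070322243 is 3 * 0.044715 * sqrt(2/pi).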
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/glu.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+        # Activation function and GLU gating.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/memory_test.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
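+    # The factor of 2 assumes 2-byte (bf16/fp16) parameter elements.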
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/mlp.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
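+
+    # Example (hypothetical configuration): with a world size of 4 and
+    # 8 experts, expert_sharding_degree is 4 and hidden_sharding_degree is 1,
+    # so rank 2 owns experts [4, 6) and all ffn rows.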
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
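+        # x arrives as [experts_per_rank, expert_capacity, hidden_size] from
+        # binned_gather in ParallelMLP, so both projections are batched
+        # matmuls, one per local expert.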
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+        # Apply the activation function.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+ # wieghted by number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/moe.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f'Expected {num_layers_per_pipeline_stage} tokens_per_expert entries '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
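+        # Example (hypothetical sizes): 4096 tokens, top_k=2, world_size=1,
+        # 8 experts and capacity_factor=1 gives a capacity of 1024 tokens per expert.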
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        # expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+            # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/mpu.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
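+
+
+# Worked example (hypothetical sizes, not from the library): with world_size=8,
+# moe_num_experts=4 and ffn_hidden_size=1024, expert_sharding_degree() is 4 and
+# hidden_sharding_degree() is 2, so experts_per_rank() is 1 and
+# features_per_rank() is 512.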
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/router.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence, we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
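+
+# Illustrative example: for an expert_indices tensor with 4 elements and
+# num_experts=2, the assignment above yields [0, 1, 0, 1] regardless of the
+# routing scores, keeping every expert equally loaded for benchmarking.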
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layers/sharedexpert_registry.py b/build/torch211-cxx11-cu126-aarch64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch211-cxx11-cu126-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..d122a7970e66f05517764a5bbe5efd723671111a
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bccbfca5181c0702db62b8285d63aa8c380902bf70555369e1a7b6c979009a01
+size 15124328
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_ops.py b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_version.py b/build/torch211-cxx11-cu126-aarch64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/backend/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/backend/kernels.py b/build/torch211-cxx11-cu126-aarch64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub triton.autotune when testing in an environment that does not have CUDA.
+# This approach preserves the original code but enables testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has greater or equal
+ # number of rows since they could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has greater or equal
+ # number of rows since they could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/benchmark_util.py b/build/torch211-cxx11-cu126-aarch64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+    print(f'mean time = {time:.3f}ms, std time = {std:.3f}ms')
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
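+
+
+# Minimal usage sketch (added for illustration; assumes a CUDA device is available):
+if __name__ == "__main__" and torch.cuda.is_available():
+    x = torch.randn(2048, 2048, device="cuda")
+    mean_ms, std_ms = benchmark_function(lambda: torch.matmul(x, x))
+    log_benchmark("matmul", {"m": 2048, "n": 2048, "k": 2048}, mean_ms, std_ms)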
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/cpu_fused_moe.py b/build/torch211-cxx11-cu126-aarch64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
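+
+
+# Illustrative check (hypothetical values): with the defaults, a gate of 10 is
+# clamped to the limit of 7, so
+#   swigluoai_activation(torch.tensor([10.0]), torch.tensor([0.0]))
+# evaluates to (0 + 1) * 7 * sigmoid(7 * 1.702), which is approximately 7.0.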
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
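+
+
+# Usage sketch (hypothetical shapes): for x of shape (tokens, hidden) and
+# router_weight of shape (num_experts, hidden),
+#
+#   logits, weights, ids = route_tokens_cpu(x, router_weight, None,
+#                                           moe_top_k=2, moe_num_experts=num_experts)
+#
+# returns logits of shape (tokens, num_experts) and weights/ids of shape (tokens, 2).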
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+    This implementation iterates over the experts and, for each one, processes all
+    of its assigned tokens with batched tensor operations, which keeps the CPU path
+    simple while avoiding a per-token Python loop.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # Process one expert at a time: for each expert, find the (token_idx, topk_pos)
+    # pairs in topk_ids that were routed to it.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
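+
+
+# End-to-end sketch on CPU (added for illustration only; sizes are hypothetical).
+if __name__ == "__main__":
+    tokens, hidden, inter, experts, top_k = 4, 8, 16, 4, 2
+    x = torch.randn(tokens, hidden)
+    router_w = torch.randn(experts, hidden)
+    w1 = torch.randn(experts, hidden, 2 * inter) * 0.02  # gate_up_proj weights
+    w2 = torch.randn(experts, inter, hidden) * 0.02      # down_proj weights
+    _, weights, ids = route_tokens_cpu(x, router_w, None, top_k, experts)
+    out = cpu_fused_moe(x, w1, w2, weights, ids, activation="silu", is_interleaved=True)
+    assert out.shape == (tokens, hidden)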
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/cpu_moe_cpp.py b/build/torch211-cxx11-cu126-aarch64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
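+
+
+# Call sketch (illustrative only; requires the compiled C++ ops and follows the
+# shapes documented in the docstring above): bf16 activations, no quantization,
+# standard SwiGLU.
+#
+#   out = fused_moe_cpp(x_bf16, w1, w2, topk_weights, topk_ids.to(torch.int32),
+#                       w1_bias=b1, w2_bias=b2)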
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP"]
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/grouped_gemm/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/grouped_gemm/backend.py b/build/torch211-cxx11-cu126-aarch64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/grouped_gemm/ops.py b/build/torch211-cxx11-cu126-aarch64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
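+
+
+# Usage sketch (illustrative; assumes the compiled grouped GEMM backend and CUDA):
+#
+#   a = torch.randn(10, 64, device="cuda", dtype=torch.bfloat16)      # all rows, concatenated
+#   b = torch.randn(2, 64, 128, device="cuda", dtype=torch.bfloat16)  # one matrix per group
+#   batch_sizes = torch.tensor([6, 4])                                # rows per group
+#   c = gmm(a, b, batch_sizes)                                        # shape (10, 128)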
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/grouped_gemm_util.py b/build/torch211-cxx11-cu126-aarch64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # import grouped_gemm
+    _grouped_gemm_is_available = True
+except ImportError:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/layers.py b/build/torch211-cxx11-cu126-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
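+
+# Illustrative effect of the patching above: while torch.compile is tracing,
+# ops.sort(x) returns empty tensors with x's shape and dtype instead of calling
+# into the CUDA kernel, so the graph can be captured without real device work.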
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Calculate the expert sharding degree from the world size and number of experts
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
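+
+# Illustrative arithmetic for the sharding helpers above (example values, not a
+# fixed configuration): with world_size=8, moe_num_experts=128 and
+# ffn_hidden_size=3072, expert_sharding_degree -> 8, hidden_sharding_degree -> 1,
+# experts_per_rank -> 16 and features_per_rank -> 3072. With world_size=16 and
+# moe_num_experts=8, expert_sharding_degree -> 8, hidden_sharding_degree -> 2
+# (so 8 * 2 == 16), experts_per_rank -> 1 and features_per_rank -> 1536.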
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
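+
+# For example, moe_jitter_eps=0.01 scales each activation by an independent
+# uniform factor drawn from [0.99, 1.01].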
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
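+
+# Shape sketch for route_tokens (assuming x is [seq_len, batch, hidden]):
+# x_flat is [seq_len * batch, hidden], logits is [tokens, moe_num_experts], and
+# expert_weights / expert_indices are [tokens, moe_top_k]. Note the softmax is
+# taken over the selected top-k logits only, not over all experts.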
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
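+
+# Shape/layout sketch for mlp_forward (shapes inferred from the bmm calls):
+# x is [num_local_experts, expert_capacity, hidden], w1 is
+# [num_local_experts, hidden, 2 * ffn] with gate and up columns interleaved
+# (even columns -> gate, odd columns -> up), and w2 is
+# [num_local_experts, ffn, hidden]. The activation gate * sigmoid(alpha * gate)
+# with alpha = 1.702 is the sigmoid approximation of GELU, and `limit` clamps
+# the gate/up activations before the GLU product.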
+
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
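+
+# For example, with shared_expert_weighted_sum=True and moe_top_k=4, the shared
+# expert output is weighted by 1/5 and the routed-expert output by 4/5.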
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f"Expected {num_layers_per_pipeline_stage} tokens_per_expert entries "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup],
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
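+
+# For example, with tokens=4096, top_k=4, num_experts=128, a single rank and
+# moe_capacity_factor=1.0, each expert gets int(1.0 * 4 * 4096 / 128) = 128
+# token slots.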
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
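+
+# This follows the Switch-Transformer-style auxiliary loss:
+#   loss = (num_experts / (tokens * top_k)) * sum_e tokens_per_expert[e] * mean_score[e]
+# which encourages a uniform token distribution across experts.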
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
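+
+# Small illustrative example: with top_expert = [2, 0, 2, 1] and num_experts = 3,
+# tokens_per_expert is [1, 1, 2] and bins (the inclusive cumsum) is [1, 2, 4];
+# bin_ids holds the expert id for each slot of the sorted token order and
+# indices maps those sorted slots back to the original token positions.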
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
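+
+# forward_once data flow, in brief: flatten the routing metadata, sort token
+# assignments by expert (indices_and_bins), gather tokens into a dense
+# [num_experts, expert_capacity, hidden] buffer (binned_gather), run the expert
+# MLPs as batched matmuls (mlp_forward), then scatter the results back to the
+# original token order weighted by expert_weights (binned_scatter).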
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+        # Asynchronously pass the token counts to the ranks that own the target experts
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
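+
+# Illustrative send/recv accounting for parallel_forward_once: with world_size=2
+# and num_experts=4 (so experts_per_rank=2 and hidden sharding degree 1), a
+# local tokens_per_expert of [3, 1, 2, 0] yields send_counts = [4, 2]: this rank
+# sends 3 + 1 = 4 tokens to the rank hosting experts {0, 1} and 2 + 0 = 2 tokens
+# to the rank hosting experts {2, 3}; recv_counts come from the transposed
+# exchange of these counts via all_to_all_single.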
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+    hidden_size: Optional[int] = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+    moe_loss_weight = 0.0  # Fixed at 0.0 for now, so the loss below is never saved; make configurable to enable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+    hidden_size: Optional[int] = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
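+# Illustrative usage sketch for the shared-expert variant (hypothetical values;
+# the router/experts attributes are attached exactly as for MegaBlocksMoeMLP):
+#
+#   mlp = MegaBlocksMoeMLPWithSharedExpert()
+#   ...  # attach mlp.router and mlp.experts as usual
+#   up_w, down_w, _, _ = create_shared_expert_weights(
+#       hidden_size=1152,
+#       shared_expert_hidden_size=3072,
+#       device=torch.device("cuda"),
+#       dtype=torch.bfloat16,
+#       init_method=torch.nn.init.kaiming_uniform_,
+#   )
+#   mlp.set_shared_expert_weights(up_w, down_w, weighted_sum=True)
+#   output, expert_weights = mlp(hidden_states)
+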
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/megablocks/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/metadata.json b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..155112c59509d3b4d07f4d090cbf57071e3f5217
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/all_to_all_benchmark.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per element (fp16).
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/binned_gather.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/binned_scatter.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/cumsum.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/gather.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/histogram.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/histogram_benchmark.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/matmul_benchmark.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/padded_gather.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/padded_scatter.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/padded_scatter_benchmark.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/permute_benchmark.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/repeat.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/replicate.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
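+
+
+# Added illustrative comment: as the shapes suggest, the forward pass
+# broadcasts each per-bin value of `x` (with `bins` holding cumulative bin
+# boundaries) across that bin's span of the `num_outputs` output columns,
+# and the backward pass reduces the incoming gradient back to one value
+# per bin (output shape (grad.shape[0], bins.shape[0])).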
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/round_up.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
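+
+
+# Added worked example: with value=128,
+#   round_up(torch.tensor([5, 128, 130], dtype=torch.int32), 128)
+# returns tensor([128, 128, 256]) -- each entry is rounded up to the next
+# multiple of 128 via truncating division of (x + 127) by 128.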
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/scatter.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/sort.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
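+
+
+# Added illustrative comment: the op returns the sorted values together with
+# the permutation that produced them, roughly torch.sort restricted to the
+# integer dtypes above. For example, with
+# x = torch.tensor([2, 0, 1, 0], dtype=torch.int32), a stable radix sort
+# yields x_out = [0, 0, 1, 2] and iota_out = [1, 3, 2, 0]. The benchmarks in
+# this build use it as `bin_ids, indices = ops.sort(top_expert)` to group
+# token positions by their assigned expert.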
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/sort_benchmark.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/stk_autocast.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/sum.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/ops/topology.py b/build/torch211-cxx11-cu126-aarch64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
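+
+
+# Added illustrative comment: given the padded per-expert bin boundaries,
+# ops.indices emits one int16 entry per potential nonzero block of an
+# `output_block_rows` x `output_block_columns` block grid. Presumably the
+# result is consumed as the column indices of the block-sparse (stk.Matrix)
+# topology that the dMoE expert matmuls operate on.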
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/backend/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/backend/autocast.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/backend/sputnik.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
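+
+
+# Added reference comment (standard matmul calculus, not specific to this
+# file): for C = op_a(A) @ op_b(B) with upstream gradient dC,
+#   C = A   @ B     ->  dA = dC @ B^T,    dB = A^T @ dC
+#   C = A   @ B^T   ->  dA = dC @ B,      dB = dC^T @ A
+#   C = A^T @ B     ->  dA = B @ dC^T,    dB = A @ dC
+#   C = A^T @ B^T   ->  dA = B^T @ dC^T,  dB = dC^T @ A^T
+# _lhs_gradient/_rhs_gradient above choose the operand order and transpose
+# flags so that the sdd/dsd/dds kernels compute exactly these products.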
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/backend/triton_kernels.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to sparse matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
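+
+# Added worked example: for a BCSR matrix with offsets = [0, 2, 3, 6] the
+# kernel writes one row id per nonzero block, so out = [0, 0, 1, 2, 2, 2]
+# (row 0 owns two blocks, row 1 one block, row 2 three blocks).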
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/matrix.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
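+
+
+# Added worked example: for a 2x2-block matrix with nonzero blocks at
+# (0, 1) and (1, 0), i.e. row_indices = [0, 1] and column_indices = [1, 0],
+# the transpose meta-data is
+#   column_indices_t = [1, 0]    (row ids reordered column-major)
+#   offsets_t        = [0, 1, 2] (one block per column of the transpose)
+#   block_offsets_t  = [1, 0]    (where each transposed block lives in data)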
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops_test.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/linear_ops.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
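+
+
+# Added naming note: the three letters encode output / lhs / rhs storage --
+# dsd: dense output = sparse @ dense, dds: dense output = dense @ sparse,
+# sdd: sparse output (with the topology of `topo`) = dense @ dense.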
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/linear_ops_test.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
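+# Converts a block-sparse Matrix back to a dense tensor by scattering each
+# block's values into its (row, column) position.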
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
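+# Block-level nonzero mask: one entry per `blocking` x `blocking` block, True when
+# the block contains any nonzero value.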
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
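+# Converts a dense tensor to block-sparse Matrix format; all-zero blocks are
+# dropped from the representation.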
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops_test.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/random/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/random/random_ops.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
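+# Samples a dense {0, 1} mask with block-structured sparsity at the requested rate.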
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/stk/random/random_ops_test.py b/build/torch211-cxx11-cu126-aarch64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from .. import random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/xpu_fused_moe.py b/build/torch211-cxx11-cu126-aarch64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# default
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
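+# Choose the smallest candidate block size for which the blocked expert-count
+# table (num_blocks_per_seq * num_experts) fits within one block; otherwise 1024.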
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
+
+
+def implement_zp(qweight):
+ # Convert u4 (unsigned 4-bit) values to s4 (signed 4-bit) so the GEMM kernel
+ # does not need an explicit zero point. Only the default zero point (8) is supported.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+ # TODO: integrate all of this into the C++ function. Temporarily exposed here until GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
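+ # Pack all intermediate buffers into a single uint8 workspace; each region is
+ # padded to a 256-byte boundary and recorded in ws_map as (size, offset).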
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code).
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
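+# Illustrative usage (assumed shapes and dtypes, not part of the original code):
+#   x = torch.randint(0, 8, (1024,), dtype=torch.int32, device="cuda")
+#   counts = histogram(x, num_bins=8)
+#   sorted_vals, sorted_idx = argsort(x, end_bit=3)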
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/activation_fn.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
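+# Applies an elementwise activation to the values of a block-sparse Matrix while
+# preserving its topology; with return_grad_fn=True it also returns out.backward
+# so the activation can be rematerialized in the backward pass.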
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/all_to_all.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
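+# Autograd-aware wrapper around dist.all_to_all_single; the backward pass reruns
+# the collective with the input/output split sizes swapped.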
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/arguments.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in shared expert (purpose: to allow using a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+ shared_expert_weighted_sum: bool = False # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/common.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/dmlp_registry.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+ (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/dmoe.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+ # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+ # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/gelu.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
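+
+
+# Sanity sketch (hedged example, not part of the original module): the manual
+# tanh-GELU derivative in _gelu_backward_inplace should agree with autograd to
+# within float tolerance, e.g.
+#
+#   x = torch.randn(16, requires_grad=True)
+#   y = F.gelu(x, approximate='tanh')
+#   y.backward(torch.ones_like(x))
+#   manual = _gelu_backward_inplace(torch.ones_like(x), x.detach())
+#   torch.testing.assert_close(manual, x.grad)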
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/glu.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported for GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # Activation function with GLU gating.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> paste -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/memory_test.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6))
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6))
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/mlp.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
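+ # Illustrative sketch (toy values, not from the original code): with
+ # moe_num_experts=8, expert_sharding_degree=4 and hidden_sharding_degree=2
+ # (an expert-parallel world size of 8), rank 5 has expert_rank = 5 % 4 = 1
+ # and row_rank = 5 // 4 = 1, so it keeps experts [2, 4) and rows
+ # [ffn_hidden_size // 2, ffn_hidden_size) of the master weights.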
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+ # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+ # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> paste -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # Enable using a weighted sum for the shared expert output,
+ # weighted by the number of experts used.
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
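+ # Worked example (assumed moe_top_k=4, illustration only): with the
+ # weighted sum enabled, t_experts = 5 and the blend is
+ # shared_expert_out / 5 + expert_out * (4 / 5), i.e. the shared expert is
+ # treated as one of five equally weighted contributors.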
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/moe.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
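+ # Worked arithmetic (toy values, illustration only): with top_k=2,
+ # tokens=1024, an expert-parallel world size of 4, 64 experts and
+ # moe_capacity_factor=1.25, tokens_per_expert = 2 * 1024 * 4 / 64 = 128
+ # and the method returns int(1.25 * 128) = 160 slots per expert.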
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
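+ # In symbols (restating the code above, hedged): loss = num_experts /
+ # (tokens * top_k) * dot(f, p), where f[e] counts tokens routed to expert e
+ # and p[e] is the mean router probability assigned to expert e over the
+ # batch, as in the Switch Transformer auxiliary loss.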
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+ # expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/mpu.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
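+
+
+# Worked example (assumed configuration, illustration only): with an
+# expert-parallel world size of 8, moe_num_experts=4 and ffn_hidden_size=4096,
+# expert_sharding_degree(args) = min(8, 4) = 4 and
+# hidden_sharding_degree(args) = 8 // 4 = 2, so each rank owns
+# experts_per_rank = 1 expert and features_per_rank = 2048 of its ffn rows.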
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/router.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+ # so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
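+
+
+# Shape sketch (inferred from the forward pass above, hedged): for input x of
+# shape [sl, bs, hs], `scores` is [sl * bs, moe_num_experts] while
+# `expert_weights` and `expert_indices` are both [sl * bs, moe_top_k].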
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch211-cxx11-cu126-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
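+
+
+# Usage sketch (assumed Arguments fields, illustration only):
+#
+#   args = Arguments(..., mlp_type='glu', shared_expert=True)
+#   shared = get(args)  # -> glu.SharedGLU instance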
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch211-cxx11-cu126-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..4850e7984e1967316cf9646f8d1f1869af56e094
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3d80af1e9bea8f67e6377db54e0b11f61bda494c42b5f5612a9f93eebc5ef55
+size 15061056
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_ops.py b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+    Prefix an op name with the extension's namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
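+
+# Illustrative example (not executed): assuming the extension registers an op
+# named "sort", the helper simply prepends the extension namespace, e.g.
+#   add_op_namespace_prefix("sort") -> "_megablocks_cuda_7a6bcf4::sort"
+# which refers to the same kernel reachable via `ops.sort` above.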
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_version.py b/build/torch211-cxx11-cu126-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/backend/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/backend/kernels.py b/build/torch211-cxx11-cu126-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have CUDA.
+# This approach preserves the original code but enables testing without a GPU.
+if torch.cuda.is_available() is False:
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has greater or equal
+ # number of rows since they could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
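+# Illustrative sketch (hypothetical values): with top_k=1, two experts,
+# bins=[2, 3] and padded_bins=[4, 6] (each bin rounded up for alignment),
+# padded_gather copies the 3 routed rows of 'x' into a zero-initialized
+# (6, hidden_size) buffer; rows 2-3 and 5 remain as padding.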
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per routed token (one entry of 'wgrad'). Array 'x' has a
+    # greater or equal number of rows since its rows may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
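+# Shape sketch (illustrative): for x of shape (tokens, hidden_size), binned_gather
+# returns (num_experts, expert_capacity, hidden_size). The launch grid has one
+# program per (expert, capacity slot); slots beyond an expert's token count stay
+# zero, and tokens beyond its capacity are dropped.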
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/benchmark_util.py b/build/torch211-cxx11-cu126-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
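+
+
+# Example usage (illustrative; requires a CUDA device):
+#
+#   a = torch.randn(4096, 4096, device="cuda")
+#   b = torch.randn(4096, 4096, device="cuda")
+#   mean_ms, std_ms = benchmark_function(lambda: a @ b)
+#   log_benchmark("matmul", {"m": 4096, "n": 4096, "k": 4096}, mean_ms, std_ms)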
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/cpu_fused_moe.py b/build/torch211-cxx11-cu126-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
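+# Illustrative sketch (hypothetical sizes): routing 6 tokens of width 16 over
+# 4 experts with top_k=2 on CPU.
+#
+#   x = torch.randn(6, 16)
+#   logits, weights, ids = route_tokens_cpu(x, torch.randn(4, 16), None, 2, 4)
+#   # logits: (6, 4); weights/ids: (6, 2); weights sum to 1 per token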
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+    This implementation loops over the experts, but processes all tokens assigned
+    to each expert with batched matrix operations rather than a per-token loop,
+    which is more efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # Process one expert at a time. For each expert, build a boolean mask over the
+    # (token_idx, topk_pos) grid marking which tokens were routed to that expert.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
+
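+# Illustrative sketch (hypothetical sizes): calling the fused CPU path directly
+# with 4 experts, hidden size 16, intermediate size 32 and top_k=2.
+#
+#   hidden = torch.randn(8, 16)
+#   w1 = torch.randn(4, 16, 64) * 0.02   # gate_up_proj, 2 * inter_size
+#   w2 = torch.randn(4, 32, 16) * 0.02   # down_proj
+#   _, weights, ids = route_tokens_cpu(hidden, torch.randn(4, 16), None, 2, 4)
+#   out = cpu_fused_moe(hidden, w1, w2, weights, ids, activation="silu")
+#   # out: (8, 16), same shape as the flattened input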
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py b/build/torch211-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP"]
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/grouped_gemm/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/grouped_gemm/backend.py b/build/torch211-cxx11-cu126-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/grouped_gemm/ops.py b/build/torch211-cxx11-cu126-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
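+
+
+# Illustrative sketch (hypothetical sizes; requires the CUDA extension): a grouped
+# GEMM over two expert groups holding 3 and 5 rows of 'a' respectively.
+#
+#   a = torch.randn(8, 16, device="cuda", dtype=torch.bfloat16)
+#   b = torch.randn(2, 16, 32, device="cuda", dtype=torch.bfloat16)
+#   batch_sizes = torch.tensor([3, 5])
+#   c = gmm(a, b, batch_sizes)   # c: (8, 32); backward is handled by GroupedGemm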
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/grouped_gemm_util.py b/build/torch211-cxx11-cu126-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # import grouped_gemm  # vendored in this package, so no external import is needed
+    _grouped_gemm_is_available = True
+except ImportError:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+ '`pip install git+https://github.com/tgale96/grouped_gemm@main`.',
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/layers.py b/build/torch211-cxx11-cu126-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Calculate the expert sharding degree based on world size and number of experts
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
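+# Worked example (illustrative): with world_size=8, moe_num_experts=4 and
+# ffn_hidden_size=1536, the expert sharding degree is min(8, 4) = 4 and the
+# hidden sharding degree is 8 // 4 = 2, so each rank holds 4 // 4 = 1 expert
+# and 1536 // 2 = 768 of its ffn features.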
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
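+# Shape sketch (illustrative): mlp_forward expects per-expert batched tensors,
+# e.g. x: (num_experts, expert_capacity, hidden_size), w1: (num_experts,
+# hidden_size, 2 * ffn_size) with gate/up interleaved on the last dim,
+# w2: (num_experts, ffn_size, hidden_size), and biases of shape
+# (num_experts, 2 * ffn_size) and (num_experts, hidden_size); the output keeps
+# the (num_experts, expert_capacity, hidden_size) layout.
+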
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f"Expected {num_layers_per_pipeline_stage} tokens_per_expert entries "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
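+    """Sort tokens by their assigned expert and build routing metadata.
+
+    Returns the permutation indices, the expert id of each sorted slot
+    (bin_ids), the inclusive-cumsum bin boundaries, and the per-expert
+    token counts.
+    """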
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
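+    """Number of token slots allocated to each expert:
+    int(moe_capacity_factor * top_k * tokens * world_size / num_experts).
+
+    For example, 4096 tokens with top_k=4, 128 experts, world_size=1, and a
+    capacity factor of 1.0 gives a capacity of 128 tokens per expert.
+    """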
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
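+    """Gather tokens into per-expert bins, run the expert MLP, and scatter
+    the results back to the original token order, weighted by the router
+    probabilities in expert_weights.
+    """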
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
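+    """Single-device MoE forward pass: compute routing metadata and expert
+    capacity, then permute the tokens, apply the experts, and un-permute.
+    Returns the expert output and the per-expert token counts.
+    """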
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
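+    """Expert-parallel MoE forward pass.
+
+    Tokens are permuted locally, exchanged across the expert-parallel group
+    with all-to-all, processed by the local experts, sent back, reduced over
+    the hidden-sharding dimension, and finally un-permuted. Returns the
+    expert output and the per-expert token counts.
+    """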
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+        # Launch the count exchange asynchronously so it overlaps with the
+        # local permutation below.
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+    uniform_expert_assignment: bool = False,
+    training: bool = False,
+    w1: Optional[torch.Tensor] = None,
+    w2: Optional[torch.Tensor] = None,
+    w1_bias: Optional[torch.Tensor] = None,
+    w2_bias: Optional[torch.Tensor] = None,
+    gradient_scale: Optional[float] = None,
+    alpha: float = 1.702,
+    sort_end_bit: int = 0,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+    moe_capacity_factor: float = 1.0,
+    moe_expert_model_parallelism: bool = False,
+    forward_fn: Any = None,
+    hidden_size: Optional[int] = None,
+    mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
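+    """Route tokens and dispatch them through forward_fn (single-device or
+    expert-parallel). Returns the layer output, the top-k expert weights,
+    and the per-token router scores.
+    """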
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+    # Save the load-balancing loss for later aggregation. The weight is
+    # currently hard-coded to 0.0, so this branch is disabled until it is
+    # made configurable.
+    moe_loss_weight = 0.0
+    if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+    uniform_expert_assignment: bool = False,
+    training: bool = False,
+    w1: Optional[torch.Tensor] = None,
+    w2: Optional[torch.Tensor] = None,
+    w1_bias: Optional[torch.Tensor] = None,
+    w2_bias: Optional[torch.Tensor] = None,
+    gradient_scale: Optional[float] = None,
+    alpha: float = 1.702,
+    sort_end_bit: int = 0,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+    moe_capacity_factor: float = 1.0,
+    moe_expert_model_parallelism: bool = False,
+    forward_fn: Any = None,
+    hidden_size: Optional[int] = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
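+    """MoE forward pass with an optional shared (always-active) expert.
+
+    When shared expert weights are provided, the shared expert output is
+    combined with the routed expert output; otherwise this reduces to a
+    plain moe_forward call.
+    """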
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
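+    """Allocate and initialize up/down projection weights for a shared
+    expert. Biases are not created, so the last two return values are None.
+    """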
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
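+        """Read the MoE configuration from the attached router/experts
+        modules, select the single-device or expert-parallel path, and
+        return the layer output together with the routing weights.
+        """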
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
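+        """Same as MegaBlocksMoeMLP.forward, but combines the routed expert
+        output with the shared expert output when shared weights are set.
+        """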
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/megablocks/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is: once it is added to `sys.modules`,
+    # it would also be picked up by other imports. Instead, we derive a unique
+    # module name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/metadata.json b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..155112c59509d3b4d07f4d090cbf57071e3f5217
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # fp16 elements are 2 bytes each.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/binned_gather.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/binned_scatter.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/cumsum.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# The import is wrapped in a try-block to give a clearer error message
+# with instructions for building the C++ operations.
+try:
+    from .._ops import ops  # type: ignore
+except ModuleNotFoundError as e:
+    raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/gather.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/histogram.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# The import is wrapped in a try-block to give a clearer error message
+# with instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/padded_gather.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/padded_scatter.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/repeat.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/replicate.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
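+
+# Sketch of the intended use, assumed from the kernel signature above rather
+# than documented here: `x` holds one value per bin, `bins` is the inclusive
+# cumsum of tokens per bin, and the output repeats each bin's value across
+# its token range.
+#
+#   x = torch.tensor([[1.0, 2.0]], device="cuda")                  # one value per bin
+#   bins = torch.tensor([3, 5], dtype=torch.int32, device="cuda")  # bin boundaries
+#   replicate(x, bins, 5)   # expected: [[1., 1., 1., 2., 2.]]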
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/round_up.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
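+
+
+# Worked example (illustrative): each entry is rounded up to the next multiple
+# of `value` via truncating division.
+#
+#   x = torch.tensor([3, 128, 130], dtype=torch.int32)
+#   round_up(x, 128)   # tensor([128, 128, 256], dtype=torch.int32)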
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/scatter.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/sort.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
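+
+# Illustrative usage, matching how the op is called elsewhere in this package:
+# `sort` returns the sorted keys together with the permutation that produced
+# them (an argsort), which is used to group token ids by expert.
+#
+#   top_expert = torch.tensor([2, 0, 1, 0], dtype=torch.int32, device="cuda")
+#   bin_ids, indices = sort(top_expert)
+#   # bin_ids -> [0, 0, 1, 2]; indices -> [1, 3, 2, 0] for a stable sort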
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/stk_autocast.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/sum.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/ops/topology.py b/build/torch211-cxx11-cu126-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/backend/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/backend/autocast.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/backend/sputnik.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/backend/triton_kernels.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+    error_string = "incompatible dimensions: tensor has a dimension of length {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to sparse matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
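+# Illustrative expansion performed by the kernel above: `offsets` is the
+# CSR-style row pointer over nonzero blocks and the kernel writes the row id
+# once per block, e.g. offsets = [0, 2, 3] -> out = [0, 0, 1].
+#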
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/matrix.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# TODO: Add heavyweight (data) validation helper.
+# TODO: Add construction helpers.
+# TODO: Make indentation consistent.
+# TODO: Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
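+
+
+# Illustrative BCSR layout, stated here as an example consistent with the
+# validation rules above: a 2x2 matrix with blocking 1 and nonzeros on the
+# diagonal would be stored as
+#   data           -> shape [2, 1, 1]  (one square block per nonzero)
+#   row_indices    -> [0, 1]
+#   column_indices -> [0, 1]
+#   offsets        -> [0, 1, 2]        (CSR-style row pointer over blocks)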
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/linear_ops.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
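+# Naming note, inferred from the signatures below: each op is named
+# output/lhs/rhs, so dsd computes a Dense output from a Sparse lhs and a
+# Dense rhs, dds a Dense output from Dense x Sparse, and sdd a Sparse output
+# (with the topology of `topo`) from Dense x Dense.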
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/random/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/random/random_ops.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
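+
+
+# Illustrative usage (a sketch, not part of the original module):
+#
+#   m = dense_mask(128, 128, sparsity=0.75, blocking=16)   # dense 0/1 mask, 25% of blocks kept
+#   s = mask(128, 128, sparsity=0.75, blocking=16)         # same style of mask as a block-sparse Matrix
+#   r = randn((4, 128, 128), sparsity=0.5, blocking=16)    # block-sparse standard-normal samples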
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py b/build/torch211-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from .. import random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/xpu_fused_moe.py b/build/torch211-cxx11-cu126-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM wrapper: builds the expert offsets on the device from
+# host-side per-expert token counts.
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
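+ # e.g. exclusive_prefix_sum([4, 0, 2]) -> [0, 4, 4, 6]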
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
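+
+# Worked example: with num_tokens=1000 and num_experts_per_node=8, 32 and 64
+# tokens per block give 32*8=256 and 16*8=128 blocks (more than the block
+# size), while 128 gives 8*8=64 <= 128, so 128 is returned.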
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
+
+
+def implement_zp(qweight):
+ # change u4 to s4 to avoid zero point in gemm kernel
+ # only support default zero point now
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+ # TODO: this will all be integrated into the C++ function. Temporarily exposed here until GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
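+ # Pad each buffer to a 256-byte boundary and record its (size, offset)
+ # so the individual buffers can later be sliced out of one uint8 workspace.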
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code).
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
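+
+
+# Illustrative usage (a sketch; requires a CUDA device and the compiled ops):
+#
+#   x = torch.tensor([3, 1, 2, 0], dtype=torch.int32, device="cuda")
+#   cumsum(x, dim=0)                     # tensor([3, 4, 6, 6]) (inclusive)
+#   vals, idx = argsort(x, end_bit=2)    # vals = [0, 1, 2, 3], idx = [3, 1, 2, 0]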
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/activation_fn.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/all_to_all.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
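+
+
+# Illustrative call (a sketch): with two ranks in `group`, a rank holding 5 rows
+# to send as [2, 3] and expecting [2, 4] rows back would use
+#
+#   out, handle = all_to_all(x, output_split_sizes=[2, 4],
+#                            input_split_sizes=[2, 3], group=group)
+#
+# `out` has sum(output_split_sizes) rows; `handle` is a Work object only when
+# async_op=True (otherwise None).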
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/arguments.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in the shared expert (purpose: allow a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+ shared_expert_weighted_sum: bool = False # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
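+
+
+# Illustrative configuration (a sketch; assumes a CUDA device and, for
+# mlp_impl='grouped', an available grouped-GEMM backend):
+#
+#   args = Arguments(
+#       hidden_size=1024,
+#       ffn_hidden_size=4096,
+#       moe_num_experts=8,
+#       moe_top_k=2,
+#       mlp_impl='grouped',
+#       fp16=False,
+#       bf16=True,
+#       device=torch.device('cuda'),
+#   )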
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/common.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/dmlp_registry.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+ (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
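+
+
+# e.g. Arguments(mlp_type='glu', mlp_impl='grouped') selects glu.GroupedGLU,
+# while the defaults ('mlp', 'sparse') select mlp.SparseMLP.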
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/dmoe.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+ # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+ # the matrix multiplications. Calculate the starting
+ # position of each bin.
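+ # e.g. with blocking=128 and tokens_per_expert=[3, 130] this yields
+ # padded_tokens_per_expert=[128, 256] and padded_bins=[128, 384],
+ # while the unpadded bins computed below are [3, 133].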
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/gelu.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
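+ # Derivative of the tanh-approximation GELU, applied in place to the incoming
+ # gradient: g *= d/dx[0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))].
+ # (0.79788456 ~= sqrt(2/pi), 0.1070322243 ~= 3 * 0.044715 * sqrt(2/pi).)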
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/glu.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported for GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
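+
+# Hedged usage sketch (hypothetical shapes, not library API documentation):
+#   x:  [tokens, hidden_size]
+#   w1, v1: [num_local_experts, ffn_per_rank, hidden_size]
+#   w2: [num_local_experts, ffn_per_rank, hidden_size]
+#   batch_sizes: CPU int64 tensor of tokens routed to each local expert
+#   y = memory_optimized_grouped_glu(x, w1, v1, w2, batch_sizes, torch.nn.functional.gelu)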
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+        w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/memory_test.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6))
+
+    # Calculate weight and gradient memory usage (2 bytes per bf16 element).
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6))
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+    # init_process_group returns None; use the default (WORLD) group explicitly.
+    dist.init_process_group(backend='nccl')
+    group = dist.group.WORLD
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
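+
+# A minimal launch sketch (assumes a single node; the exact module path depends
+# on how this build directory is installed/imported):
+#   torchrun --nproc-per-node=<num_gpus> -m <package>._layers.memory_test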
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/mlp.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+    def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
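+
+# Descriptive note: ScaleGradient is the identity in the forward pass and
+# multiplies the incoming gradient by `scale` in the backward pass. The MLPs
+# below use it to divide expert-weight gradients by the expert-parallel world
+# size, e.g. (illustrative): w_scaled = scale_gradient(w, 1 / world_size).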
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
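+
+# Worked example (hypothetical numbers): with 8 experts, ffn_hidden_size=4096
+# and an expert-parallel world size of 4, expert_sharding_degree=4 and
+# hidden_sharding_degree=1, so each rank keeps 2 experts and all 4096 rows:
+#   rank 0 -> experts [0:2], rows [0:4096]; rank 1 -> experts [2:4], ...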
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+        # Apply the activation function.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
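+
+# Note on the memory-reuse scheme above (descriptive, derived from the code):
+# the forward pass saves only x and the pre-activation sdd_out; the activation
+# output is recomputed in the backward pass, its buffer is then reused to hold
+# dactivation_fn_out, and the incoming ddsd_out buffer is reused to hold dx.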
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+    Note: this is a copy -> paste -> modify of the LLM-Foundry MPTMLP class.
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+            # Enable using a weighted sum for the shared expert output,
+            # weighted by the number of experts used.
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/moe.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
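+
+# In symbols (matching the code above): with per-layer mean expert scores s and
+# token counts t concatenated over the layers in this pipeline stage,
+#   loss = (num_experts * loss_weight) / (num_layers * tokens * top_k) * dot(t, s)
+# Hypothetical example: 2 experts, 1 layer, 4 tokens, top_k=1, loss_weight=0.1,
+# t = [3, 1], s = [0.7, 0.3] -> loss = (2*0.1)/(1*4*1) * (3*0.7 + 1*0.3) = 0.12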
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
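+
+    # Hypothetical example: with top_k=2, 1024 tokens, world_size=1, 8 experts
+    # and moe_capacity_factor=1.0, expert_capacity = int(1.0 * 2*1024/8) = 256.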
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
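+
+    # Toy example (illustrative only): top_expert = [1, 0, 1, 1] with 2 experts
+    # sorts to bin_ids = [0, 1, 1, 1], indices = [1, 0, 2, 3],
+    # tokens_per_expert = [1, 3] and bins = inclusive_cumsum -> [1, 4].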
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+        tokens_per_expert: torch.Tensor,  # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
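+
+    # Illustrative example (hypothetical counts): with 2 expert-parallel ranks,
+    # rank 0 might have send_counts=[5, 3] (5 rows stay for its own experts,
+    # 3 rows go to rank 1) and recv_counts=[5, 6] (6 rows arrive from rank 1);
+    # the second all_to_all swaps these counts to route results back before the
+    # final local scatter restores the original token order.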
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/mpu.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+        super().__init__()
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
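+
+
+# Worked example (hypothetical numbers): world_size=8, moe_num_experts=4 and
+# ffn_hidden_size=8192 -> expert_sharding_degree=4, hidden_sharding_degree=2,
+# experts_per_rank=1 and features_per_rank=4096.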
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/router.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
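+
+    # Shape sketch (illustrative): for x of shape [sl, bs, hs] the router
+    # returns scores [sl*bs, num_experts], expert_weights [sl*bs, top_k] and
+    # expert_indices [sl*bs, top_k].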
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layers/sharedexpert_registry.py b/build/torch211-cxx11-cu128-aarch64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch211-cxx11-cu128-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..fea7e4a177f87345182ed40bc6ffd6dd007b6ca5
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91f8b706af9af0569af55e732d0f508af6a31c3bff9268dc3cbd24193c5fee0c
+size 21088232
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_ops.py b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_version.py b/build/torch211-cxx11-cu128-aarch64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/backend/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/backend/kernels.py b/build/torch211-cxx11-cu128-aarch64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton autotune when testing in an environment without CUDA.
+# This approach preserves the original code but enables testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has greater or equal
+ # number of rows since they could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
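+
+# Toy example (illustrative only): with bins=[1, 3] and block-padded
+# padded_bins=[128, 256], expert 0's single token lands at row 0, the two
+# tokens for expert 1 land at rows 128 and 129 of the zero-initialized output,
+# and the remaining rows in each region stay as padding.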
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has greater or equal
+ # number of rows since they could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/benchmark_util.py b/build/torch211-cxx11-cu128-aarch64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
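+
+
+# Illustrative usage sketch (not part of the original module): time a single
+# CUDA matmul with benchmark_function and report it via log_benchmark. The
+# shapes and the benchmark name are arbitrary and assume a CUDA device.
+if __name__ == "__main__":
+    a = torch.randn(1024, 1024, device="cuda")
+    b = torch.randn(1024, 1024, device="cuda")
+    mean_ms, std_ms = benchmark_function(lambda: torch.matmul(a, b))
+    log_benchmark("MatMul", {"m": 1024, "n": 1024, "k": 1024}, mean_ms, std_ms)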
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/cpu_fused_moe.py b/build/torch211-cxx11-cu128-aarch64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
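+
+
+# Illustrative sketch (not part of the original module): how the clamped
+# SwigluOAI activation compares with the plain SiLU gate on toy values. The
+# helper name and numbers are made up for the example.
+def _activation_demo() -> None:
+    gate = torch.tensor([[0.5, 8.0]])
+    up = torch.tensor([[1.0, 10.0]])
+    # Out-of-range entries are clamped to +/-limit before gating, so the
+    # second column saturates near (limit + 1) * limit ~= 56.
+    print(swigluoai_activation(gate, up))
+    # The plain SwiGLU variant applies no clamping.
+    print(silu_and_mul_activation(gate, up))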
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
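+
+
+# Illustrative sketch (not part of the original module): routing a handful of
+# tokens through a random router. The helper name and all sizes are made up.
+def _routing_demo() -> None:
+    tokens, hidden, num_experts, top_k = 4, 16, 8, 2
+    x = torch.randn(tokens, hidden)
+    router_weight = torch.randn(num_experts, hidden)
+    logits, weights, indices = route_tokens_cpu(x, router_weight, None, top_k, num_experts)
+    # logits: [4, 8]; weights and indices: [4, 2]; weights sum to 1 per token.
+    print(logits.shape, weights.shape, indices.shape, weights.sum(dim=-1))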
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+    This implementation loops over the experts and processes each expert's
+    assigned tokens with batched PyTorch operations, which is reasonably
+    efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # For each expert, gather its assigned tokens, run the expert MLP on them,
+    # and accumulate the weighted results back into the output.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
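+
+
+# Illustrative sketch (not part of the original module): a tiny end-to-end call
+# into cpu_fused_moe with random weights in the non-interleaved layout. The
+# helper name and shapes are made up but follow the docstring above.
+def _fused_moe_demo() -> None:
+    tokens, hidden, inter, num_experts, top_k = 4, 16, 32, 8, 2
+    x = torch.randn(tokens, hidden)
+    w1 = torch.randn(num_experts, hidden, 2 * inter) * 0.02
+    w2 = torch.randn(num_experts, inter, hidden) * 0.02
+    router_weight = torch.randn(num_experts, hidden)
+    _, weights, ids = route_tokens_cpu(x, router_weight, None, top_k, num_experts)
+    out = cpu_fused_moe(x, w1, w2, weights, ids, activation="silu", is_interleaved=False)
+    print(out.shape)  # torch.Size([4, 16])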
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/cpu_moe_cpp.py b/build/torch211-cxx11-cu128-aarch64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
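+
+
+# Illustrative sketch (not part of the original module): calling fused_moe_cpp
+# directly with bf16 tensors. This needs the compiled C++ extension behind
+# ops.fused_experts; the helper name and all sizes are made up.
+def _fused_moe_cpp_demo() -> None:
+    tokens, hidden, inter, num_experts, top_k = 4, 64, 128, 8, 2
+    x = torch.randn(tokens, hidden, dtype=torch.bfloat16)
+    w1 = torch.randn(num_experts, 2 * inter, hidden, dtype=torch.bfloat16)  # [E, 2N, K]
+    w2 = torch.randn(num_experts, hidden, inter, dtype=torch.bfloat16)      # [E, K, N]
+    router_weight = torch.randn(num_experts, hidden, dtype=torch.bfloat16)
+    _, weights, ids = route_tokens_cpu(x, router_weight, None, top_k, num_experts)
+    out = fused_moe_cpp(x, w1, w2, weights, ids.to(torch.int32))
+    print(out.shape)  # expected: [tokens, hidden]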
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "CPUMegaBlocksMoeMLP"]
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/grouped_gemm/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/grouped_gemm/backend.py b/build/torch211-cxx11-cu128-aarch64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
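+
+
+# Illustrative sketch (not part of the original module): a grouped GEMM over
+# two expert groups. Requires the compiled megablocks backend and a CUDA
+# device; the sizes below are arbitrary.
+if __name__ == "__main__":
+    a = torch.randn(6, 16, device="cuda", dtype=torch.bfloat16)
+    b = torch.randn(2, 16, 32, device="cuda", dtype=torch.bfloat16)
+    batch_sizes = torch.tensor([2, 4])  # rows 0-1 use b[0], rows 2-5 use b[1]
+    out = gmm(a, b, batch_sizes)
+    print(out.shape)  # torch.Size([6, 32])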
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/grouped_gemm/ops.py b/build/torch211-cxx11-cu128-aarch64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
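+
+
+# Illustrative sketch (not part of the original module): the autograd wrapper
+# in action. Requires the compiled backend and a CUDA device; sizes are
+# arbitrary.
+if __name__ == "__main__":
+    a = torch.randn(6, 16, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+    b = torch.randn(2, 16, 32, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+    batch_sizes = torch.tensor([2, 4])
+    out = gmm(a, b, batch_sizes)
+    out.sum().backward()  # gradients flow to 'a' and 'b' via GroupedGemm.backward
+    print(a.grad.shape, b.grad.shape)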
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/grouped_gemm_util.py b/build/torch211-cxx11-cu128-aarch64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored in this repository (see ./grouped_gemm), so no
+    # external import is required here.
+    _grouped_gemm_is_available = True
+except ImportError:
+    warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/layers.py b/build/torch211-cxx11-cu128-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+                # Meta implementation - match the real output shape
+                # (num_experts, expert_capacity, hidden_size).
+                if x.dim() >= 2:
+                    return torch.empty(
+                        (bins.shape[0], bin_size, x.size(-1)),
+                        dtype=x.dtype,
+                        device=x.device,
+                    )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+                # Meta implementation - the scatter reduces back to
+                # (tokens, hidden_size).
+                if x.dim() >= 3:
+                    tokens = indices.size(0) // top_k if top_k > 0 else x.size(1)
+                    return torch.empty(
+                        (tokens, x.size(-1)), dtype=x.dtype, device=x.device
+                    )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
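+
+
+# Illustrative sketch (not part of the original module): how the sharding
+# helpers split 8 experts with ffn_hidden_size=3072 across 16 ranks. The
+# helper name and numbers are made up but satisfy the divisibility checks.
+def _sharding_demo() -> None:
+    world_size, num_experts, ffn_hidden_size = 16, 8, 3072
+    print(expert_sharding_degree(world_size, num_experts))                   # 8
+    print(hidden_sharding_degree(world_size, num_experts, ffn_hidden_size))  # 16 // 8 = 2
+    print(experts_per_rank(num_experts, world_size))                         # 8 // 8 = 1
+    print(features_per_rank(ffn_hidden_size, world_size, num_experts))       # 3072 // 2 = 1536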
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f"Expected {num_layers_per_pipeline_stage} tokens_per_expert "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
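+
+
+# Illustrative sketch (not part of the original module): the capacity formula
+# for the non-parallel case. With 1024 tokens, top_k=4, 128 experts and a
+# capacity factor of 1.0, each expert is sized for 4 * 1024 / 128 = 32 slots.
+# The helper name and numbers are made up for the example.
+def _expert_capacity_demo() -> None:
+    cap = expert_capacity(
+        tokens=1024,
+        top_k=4,
+        num_experts=128,
+        expert_parallel_group=None,
+        moe_capacity_factor=1.0,
+        moe_expert_model_parallelism=False,
+    )
+    print(cap)  # 32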
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+    tokens, score_num_experts = expert_scores.size()
+    assert score_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (count_num_experts,) = tokens_per_expert.size()
+    assert count_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
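+
+
+# Illustrative sketch (not part of the original module): what indices_and_bins
+# computes, reproduced with plain torch ops on CPU for a tiny routing example.
+# The real function relies on the fused CUDA sort/histogram/cumsum kernels.
+def _indices_and_bins_demo() -> None:
+    num_experts = 3
+    top_expert = torch.tensor([1, 0, 1, 2])
+    bin_ids, indices = torch.sort(top_expert, stable=True)    # [0, 1, 1, 2], [1, 0, 2, 3]
+    tokens_per_expert = torch.bincount(top_expert, minlength=num_experts)  # [1, 2, 1]
+    bins = torch.cumsum(tokens_per_expert, dim=0)             # [1, 3, 4] (end offsets per expert)
+    print(indices, bin_ids, bins, tokens_per_expert)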
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Ensure CUB knows which device to use
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+    uniform_expert_assignment: bool = False,
+    training: bool = False,
+    w1: Optional[torch.Tensor] = None,
+    w2: Optional[torch.Tensor] = None,
+    w1_bias: Optional[torch.Tensor] = None,
+    w2_bias: Optional[torch.Tensor] = None,
+    gradient_scale: Optional[float] = None,
+    alpha: float = 1.702,
+    sort_end_bit: int = 0,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+    moe_capacity_factor: float = 1.0,
+    moe_expert_model_parallelism: bool = False,
+    forward_fn: Any = None,
+    hidden_size: Optional[int] = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
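+
+
+# Minimal sketch of the router_scores construction used above, with made-up
+# numbers: the top-k weights are scattered back into a dense
+# (num_tokens, num_experts) grid and then transposed to (num_experts, num_tokens).
+def _router_scores_example():
+    logits = torch.zeros(2, 4)  # 2 tokens, 4 experts
+    expert_indices = torch.tensor([[0, 2], [1, 3]])
+    expert_weights = torch.tensor([[0.7, 0.3], [0.6, 0.4]])
+    scores = (
+        torch.zeros_like(logits)
+        .scatter_(1, expert_indices, expert_weights)
+        .transpose(0, 1)
+    )
+    return scores  # shape (4, 2)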
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+    uniform_expert_assignment: bool = False,
+    training: bool = False,
+    w1: Optional[torch.Tensor] = None,
+    w2: Optional[torch.Tensor] = None,
+    w1_bias: Optional[torch.Tensor] = None,
+    w2_bias: Optional[torch.Tensor] = None,
+    gradient_scale: Optional[float] = None,
+    alpha: float = 1.702,
+    sort_end_bit: int = 0,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+    moe_capacity_factor: float = 1.0,
+    moe_expert_model_parallelism: bool = False,
+    forward_fn: Any = None,
+    hidden_size: Optional[int] = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
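+
+
+# Usage sketch with assumed sizes and standard torch initializers (the concrete
+# values are hypothetical and only meant to show the calling convention):
+def _create_shared_expert_weights_example():
+    up_w, down_w, up_b, down_b = create_shared_expert_weights(
+        hidden_size=1024,
+        shared_expert_hidden_size=4096,
+        device=torch.device("cpu"),
+        dtype=torch.float32,
+        init_method=torch.nn.init.xavier_uniform_,
+    )
+    # up_w: (4096, 1024), down_w: (1024, 4096); biases are None by default.
+    return up_w, down_w, up_b, down_b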
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
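+
+
+# Tiny illustration of the closure introspection get_device_mesh relies on.
+# The hook below is hypothetical; the real pre-hook is registered by the
+# transformers integration code.
+def _closure_introspection_example():
+    device_mesh = "fake-mesh"
+
+    def hook(module, args):
+        return device_mesh  # captures `device_mesh` as a free variable
+
+    idx = hook.__code__.co_freevars.index("device_mesh")
+    return hook.__closure__[idx].cell_contents  # -> "fake-mesh"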
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
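+
+
+# Sketch of wiring up the shared expert (hypothetical sizes; the router and
+# experts are assumed to be configured the same way as for MegaBlocksMoeMLP):
+def _shared_expert_setup_example():
+    mlp = MegaBlocksMoeMLPWithSharedExpert()
+    up_w, down_w, _, _ = create_shared_expert_weights(
+        hidden_size=1024,
+        shared_expert_hidden_size=4096,
+        device=torch.device("cpu"),
+        dtype=torch.float32,
+        init_method=torch.nn.init.xavier_uniform_,
+    )
+    mlp.set_shared_expert_weights(up_w, down_w, weighted_sum=True)
+    return mlp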
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+    from .xpu_fused_moe import MegaBlocksMoeMLP
+
+try:
+    from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
+except ImportError:
+    # The CPU extension is optional; CUDA-only builds may not ship it.
+    CPUMegaBlocksMoeMLP = None
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/megablocks/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is: once it is added to `sys.modules`,
+    # it would also be picked up by other imports. Instead, we derive a unique
+    # module name from the hex-encoded hash of the file path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
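+# Re-export everything from the top-level package __init__.py so that this
+# build-specific directory behaves like the canonical `megablocks` package.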
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/metadata.json b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e3e4edf582b7ffb515d0ed32e9fc9c89f125c441
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/all_to_all_benchmark.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per fp16 element.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/binned_gather.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/binned_scatter.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/cumsum.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
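+
+
+# Assuming the CUDA kernels follow the usual cumsum conventions, for
+# x = [2, 3, 1] (int32, on GPU):
+#   exclusive_cumsum(x, 0) -> [0, 2, 5]   (start offsets)
+#   inclusive_cumsum(x, 0) -> [2, 5, 6]   (end offsets, used as "bins" elsewhere)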
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/gather.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/histogram.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/histogram_benchmark.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/matmul_benchmark.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1), which calls
+# torch.as_strided(...). Circumvent this chain to avoid the overhead
+# it adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/padded_gather.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/padded_scatter.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/padded_scatter_benchmark.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/permute_benchmark.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/repeat.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
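+
+
+# For example, repeat(x, (2, 1)) stacks two copies of x along dim 0, while an
+# all-ones tiling such as repeat(x, (1, 1)) returns x itself without copying.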
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/replicate.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/round_up.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
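+
+
+# Worked example: round_up(torch.tensor([1, 128, 129], dtype=torch.int32), 128)
+# -> [128, 128, 256], i.e. each entry is rounded up to the next multiple of 128.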
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/scatter.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/sort.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
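+
+
+# Example (hypothetical values): sorting expert assignments with only as many
+# radix bits as needed, e.g. 128 experts -> end_bit = 7:
+#   x = torch.tensor([3, 0, 2, 0], dtype=torch.int32, device="cuda")
+#   sorted_x, order = sort(x, 7)  # sorted_x = [0, 0, 2, 3]; order = argsort indices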
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/sort_benchmark.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/stk_autocast.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+        return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/sum.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
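+
+
+# A minimal sketch of the intent (hypothetical shapes): for router weights of
+# shape [tokens, top_k], sum(w, dim=1) reduces over top_k; when top_k == 1 the
+# squeeze branch returns a view instead of launching a reduction kernel.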
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/ops/topology.py b/build/torch211-cxx11-cu128-aarch64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
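+
+
+# Usage sketch (hypothetical values; padded_bins is the padded, cumulative
+# token count per expert on a CUDA device):
+#   padded_bins = torch.tensor([128, 256, 512], dtype=torch.int32, device="cuda")
+#   out = topology(padded_bins, 128, 4, 12)  # int16 tensor with 4 * 12 entries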
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/backend/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/backend/autocast.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+        # Recurse into dicts so nested tensors are cast as well.
+        return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
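+
+
+# Usage sketch (hypothetical Function; the decorators above are the real API):
+#
+#   class Scale(torch.autograd.Function):
+#       @staticmethod
+#       @custom_fwd
+#       def forward(ctx, x):
+#           return x * 2
+#
+#       @staticmethod
+#       @custom_bwd
+#       def backward(ctx, dy):
+#           return dy * 2
+#
+# With torch.autocast active, forward receives its tensor arguments cast to the
+# autocast dtype; backward always runs with autocast disabled.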
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/backend/sputnik.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
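+
+# For example, torch.randn(4, 8).t() has shape (8, 4) and stride (1, 8), so it
+# is reported as transposed; a freshly allocated (8, 4) tensor is not.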
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/backend/triton_kernels.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
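+
+# With the default TritonConfig this requires M and N to be multiples of 128
+# and K a multiple of 32; e.g. M=256, K=48, N=384 fails the K check.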
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to sparse matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
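+
+# Example (sketch): offsets = [0, 2, 3] describes two nonzero blocks in block
+# row 0 and one in block row 1, so the kernel writes out = [0, 0, 1].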
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/matrix.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+    block_rows = np.prod(shape[:-1]) // block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
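+
+
+# Usage sketch: mul(a, stk.ops.ones_like(a)) returns a Matrix with the same
+# values and topology as a, since only the data tensors are multiplied.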
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops_test.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/linear_ops.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/linear_ops_test.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
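+
+
+# Round-trip sketch: to_dense(to_sparse(x, blocking=128)) reproduces a float16
+# x exactly whenever no 128x128 block of x is entirely zero, e.g.
+#   x = torch.randn(256, 256).half()
+#   torch.equal(to_dense(to_sparse(x, blocking=128)), x)  # True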
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops_test.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/random/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/random/random_ops.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
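+
+# Example: dense_mask(8, 16, 0.5, blocking=8) keeps round(2 * 0.5) = 1 of the
+# two 8x8 blocks, chosen at random, and zeroes the other.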
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/stk/random/random_ops_test.py b/build/torch211-cxx11-cu128-aarch64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from .. import random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/xpu_fused_moe.py b/build/torch211-cxx11-cu128-aarch64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# default
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
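+
+    # e.g. exclusive_prefix_sum([3, 1, 2]) == [0, 3, 4, 6]; the last entry is
+    # the total number of routed tokens.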
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
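+    # Pick the smallest candidate block size such that
+    # ceil(num_tokens / block) * num_experts_per_node <= block; otherwise fall back to 1024.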
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
+
+
+def implement_zp(qweight):
+    # Convert packed u4 weights to s4 so the GEMM kernel does not need a zero point.
+    # Only the default zero point (8) is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ function. Temporarily exposed here until the GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
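+    # Slice typed views of the prologue outputs (per-expert offsets and the
+    # permutation maps) out of the shared uint8 workspace.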
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
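+    # Grouped GEMM over the permuted tokens: [tokens, hidden_size] x w13 -> [tokens, 2*inter_size],
+    # i.e. the fused gate/up projection. The scaled branch handles fp8/int4/mxfp4 weights.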
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
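+    # Grouped GEMM down-projection: [tokens, inter_size] x w2 -> [tokens, hidden_size].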
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
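+    # Un-permute the expert outputs back into token order and combine them with
+    # the top-k routing weights.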
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
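+    # Softmax is taken over the selected top-k logits only, not the full expert set.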
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
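+
+# A minimal usage sketch (device and dtype are illustrative assumptions):
+#   x = torch.arange(4, device="cuda", dtype=torch.int32)  # [0, 1, 2, 3]
+#   cumsum(x)                  # inclusive -> [0, 1, 3, 6]
+#   cumsum(x, exclusive=True)  # exclusive -> [0, 0, 1, 3]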
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/activation_fn.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/all_to_all.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/arguments.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in the shared expert (allows a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+    shared_expert_hidden_size: Optional[
+        int] = None  # hidden size of the shared expert, if it should differ from ffn_hidden_size (the default)
+    shared_expert_weighted_sum: bool = False  # enable using a weighted sum for the shared expert output (weighted by the number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+                from packaging import version as _version
+                if _version.parse(triton.__version__) >= _version.parse('3.2.0'):
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/common.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/dmlp_registry.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/dmoe.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
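+# Promote a 0-d tensor to 1-d so it can be concatenated and indexed downstream.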
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+ # A blocks offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/gelu.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/glu.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+        # Apply the activation function and GLU gating.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+        # Compute the GLU.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/memory_test.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/mlp.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+        # Activation function.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+            # Use a weighted sum for the shared expert output,
+            # weighted by the number of experts used.
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/moe.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f'Expected {num_layers_per_pipeline_stage} tokens_per_expert '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
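+
+
+# Illustrative numeric check (assumed values, not part of the original API):
+# the scale above for moe_loss_weight = 0.1, 8 experts, 2 layers, 1024 tokens
+# and top_k = 2 is (8 * 0.1) / (2 * 1024 * 2), roughly 1.95e-4.
+def _example_lbl_scale():
+    moe_num_experts, moe_loss_weight = 8, 0.1
+    num_layers, tokens, moe_top_k = 2, 1024, 2
+    return (moe_num_experts * moe_loss_weight) / (num_layers * tokens * moe_top_k)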
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
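+
+    # Worked example (illustrative, assumed values): with top_k = 2, 4096
+    # tokens, world_size = 1, 8 experts and moe_capacity_factor = 1.25,
+    # tokens_per_expert = 2 * 4096 / 8 = 1024 and the returned capacity is
+    # int(1.25 * 1024) = 1280 tokens per expert.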
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignments. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+            # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
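+
+
+# Illustrative usage sketch (hypothetical Arguments construction; the dataclass
+# accepts further fields such as device, dtype and parallelism settings that
+# are left at their defaults here).
+def _example_moe_forward():
+    args = Arguments(
+        hidden_size=128,
+        ffn_hidden_size=512,
+        moe_num_experts=8,
+        moe_top_k=2,
+    )
+    layer = MoE(args)
+    x = torch.randn(16, 4, 128, device=args.device)  # [sl, bs, hs]
+    return layer(x)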
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/mpu.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
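+
+
+# Illustrative sketch (assumed sizes, not part of the original API): with 4
+# expert-parallel ranks and 2 experts, expert_sharding_degree = min(4, 2) = 2
+# and hidden_sharding_degree = 4 // 2 = 2, so each rank owns 1 expert and
+# half of the ffn_hidden_size features.
+def _example_sharding_degrees():
+    world_size, moe_num_experts, ffn_hidden_size = 4, 2, 1024
+    esd = min(world_size, moe_num_experts)
+    hsd = world_size // esd
+    return moe_num_experts // esd, ffn_hidden_size // hsd  # (1, 512)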
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/router.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
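+
+
+# Illustrative sketch (assumed sizes, not part of the original API): plain
+# top-k routing mirroring the softmax -> topk -> optional re-normalization
+# performed by LearnedRouter.forward above (re-normalization shown with p = 1).
+def _example_topk_routing(tokens: int = 4, num_experts: int = 8, top_k: int = 2):
+    logits = torch.randn(tokens, num_experts)
+    scores = logits.softmax(dim=-1)
+    expert_weights, expert_indices = torch.topk(scores, top_k, dim=-1)
+    expert_weights = expert_weights / expert_weights.sum(dim=-1, keepdim=True)
+    return scores, expert_weights, expert_indices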
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch211-cxx11-cu128-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+    """Returns a SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch211-cxx11-cu128-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..767c0720dd56d80d3e809cbe78db66791cefbc43
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:adea81c43411e3594ad56b695b6913a6b03ccffa516d582f4cf6a6dba57bab04
+size 21009984
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_ops.py b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
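+
+
+# Illustrative usage (the op name 'sort' is assumed for demonstration):
+#   add_op_namespace_prefix("sort") -> "_megablocks_cuda_7a6bcf4::sort"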
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_version.py b/build/torch211-cxx11-cu128-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/backend/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/backend/kernels.py b/build/torch211-cxx11-cu128-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton autotune when testing in an environment that does not have
+# CUDA. This preserves the original code while enabling testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
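+
+
+# Shape example (illustrative, assumed sizes): with 4 tokens, top_k = 2 and
+# 3 experts, 'indices' and 'bin_ids' each have 8 entries. If the per-expert
+# token counts are [3, 2, 3], then bins = [3, 5, 8]; padding each expert's
+# count up to a 64-row block gives padded_bins = [64, 128, 192], so 'out'
+# has padded_bins[-1] = 192 rows.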
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+    # NOTE: There is no padding, so the number of output rows equals the
+    # number of input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/benchmark_util.py b/build/torch211-cxx11-cu128-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
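+
+
+# Illustrative usage sketch (assumes a CUDA device, since benchmark_function
+# times with torch.cuda.Event): benchmark a large matmul and print the results.
+if __name__ == "__main__" and torch.cuda.is_available():
+    a = torch.randn(4096, 4096, device='cuda')
+    b = torch.randn(4096, 4096, device='cuda')
+    mean_ms, std_ms = benchmark_function(lambda: torch.mm(a, b))
+    log_benchmark('MatMul', {'m': 4096, 'n': 4096, 'k': 4096}, mean_ms, std_ms)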
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/cpu_fused_moe.py b/build/torch211-cxx11-cu128-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
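+
+
+# Illustrative sketch of the routing helper above (arbitrary sizes):
+#
+#     hidden, num_experts, top_k = 64, 8, 2
+#     x = torch.randn(4, hidden)
+#     router_w = torch.randn(num_experts, hidden)
+#     logits, weights, ids = route_tokens_cpu(x, router_w, None, top_k, num_experts)
+#     # logits: [4, 8]; weights and ids: [4, 2], weights softmaxed over the top_k dim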
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+ This implementation processes all experts in parallel using batched operations
+ instead of sequential for loops, which is more efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+ # Build expert mask: which tokens go to which expert
+ # expert_mask[expert_id] contains indices of (token_idx, topk_pos) pairs
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
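+
+
+# Illustrative sketch of a tiny end-to-end call (arbitrary sizes; the standard,
+# non-interleaved gate/up layout is used here):
+#
+#     tokens, hidden, inter, num_experts, top_k = 4, 32, 64, 8, 2
+#     x = torch.randn(tokens, hidden)
+#     w1 = torch.randn(num_experts, hidden, 2 * inter) * 0.02
+#     w2 = torch.randn(num_experts, inter, hidden) * 0.02
+#     _, weights, ids = route_tokens_cpu(x, torch.randn(num_experts, hidden), None, top_k, num_experts)
+#     out = cpu_fused_moe(x, w1, w2, weights, ids, activation="silu", is_interleaved=False)
+#     # out: [4, 32]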
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py b/build/torch211-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
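+
+
+# Illustrative sketch of calling the C++ path directly (requires the compiled
+# _ops extension; shapes follow the docstring above, i.e. w1 is [E, 2N, K] and
+# w2 is [E, K, N], transposed relative to the pure-PyTorch cpu_fused_moe layout):
+#
+#     x = torch.randn(4, 32, dtype=torch.bfloat16)          # [M, K]
+#     w1 = torch.randn(8, 128, 32, dtype=torch.bfloat16)    # [E, 2N, K], N = 64
+#     w2 = torch.randn(8, 32, 64, dtype=torch.bfloat16)     # [E, K, N]
+#     weights = torch.rand(4, 2)                            # [M, topk]
+#     ids = torch.randint(0, 8, (4, 2), dtype=torch.int32)  # [M, topk]
+#     out = fused_moe_cpp(x, w1, w2, weights, ids)           # [M, K]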
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP"]
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/grouped_gemm/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/grouped_gemm/backend.py b/build/torch211-cxx11-cu128-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/grouped_gemm/ops.py b/build/torch211-cxx11-cu128-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
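+
+
+# Illustrative sketch: a grouped GEMM over two groups of rows. batch_sizes is a
+# 1-D CPU tensor of per-group row counts summing to a.shape[0]; with
+# trans_b=False, b holds one [k, n] matrix per group. Dtype/device support
+# follows the underlying backend (bfloat16 on CUDA is the typical case).
+#
+#     a = torch.randn(10, 16, device='cuda', dtype=torch.bfloat16)
+#     b = torch.randn(2, 16, 32, device='cuda', dtype=torch.bfloat16)
+#     batch_sizes = torch.tensor([6, 4])
+#     c = gmm(a, b, batch_sizes)  # [10, 32]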
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/grouped_gemm_util.py b/build/torch211-cxx11-cu128-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored into this package, so no external import is required.
+    _grouped_gemm_is_available = True
+except ImportError:
+    warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend as ops
+from .grouped_gemm import ops as backend
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/layers.py b/build/torch211-cxx11-cu128-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
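+# For example, with moe_jitter_eps=0.1 each element of x is scaled by a factor
+# drawn uniformly from [0.9, 1.1).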
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
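+# Note: gate/up are read from the interleaved [g0, u0, g1, u1, ...] layout of the
+# gate_up projection, and the activation is the clamped swiglu variant
+# (gate clamped above by `limit`, up clamped to [-limit, limit]).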
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
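+# For example, with moe_top_k=3 and shared_expert_weighted_sum=True the shared
+# expert contributes 1/4 of the combined output and the routed experts 3/4.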
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
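+# For example, 1024 tokens with top_k=4, 128 experts, capacity_factor=1.0 and no
+# expert model parallelism gives int(1.0 * 4 * 1024 * 1 / 128) = 32 slots per expert.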
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+    tokens, score_num_experts = expert_scores.size()
+    assert score_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (count_num_experts,) = tokens_per_expert.size()
+    assert count_num_experts == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
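+# For example, top_expert = [2, 0, 2, 1] with num_experts=3 yields
+# bin_ids = [0, 1, 2, 2], indices = [1, 3, 0, 2] (token order sorted by expert),
+# tokens_per_expert = [1, 1, 2] and bins = [1, 2, 4] (inclusive cumsum).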
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Ensure CUB knows which device to use
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
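+
+
+# Illustrative sketch (arbitrary sizes; any in-place initializer works):
+#
+#     up_w, down_w, _, _ = create_shared_expert_weights(
+#         hidden_size=1152,
+#         shared_expert_hidden_size=3072,
+#         device=torch.device("cpu"),
+#         dtype=torch.float32,
+#         init_method=torch.nn.init.xavier_uniform_,
+#     )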
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
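+        """Attach the shared-expert weights used alongside the routed experts.
+
+        Sketch of expected usage (names and shapes are illustrative assumptions,
+        not checked here):
+
+            mlp.set_shared_expert_weights(
+                up_proj_weight=torch.randn(hidden_size, shared_ffn_size, device="cuda"),
+                down_proj_weight=torch.randn(shared_ffn_size, hidden_size, device="cuda"),
+                activation_fn=torch.nn.functional.silu,
+            )
+        """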
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> "tuple[torch.Tensor, torch.Tensor]":
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/megablocks/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is: after adding it to `sys.modules`,
+    # it would also be used for other imports. Instead, derive a unique module
+    # name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/metadata.json b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e3e4edf582b7ffb515d0ed32e9fc9c89f125c441
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per element (fp16).
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+    assert dist.is_available()
+    # init_process_group returns None; benchmark against the default (WORLD) group.
+    dist.init_process_group(backend='nccl')
+    group = dist.group.WORLD
+    local_rank = dist.get_rank(group)
+    torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/binned_gather.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/binned_scatter.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/cumsum.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
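+
+# Example (illustrative): for x = [1, 2, 3] along dim 0,
+# exclusive_cumsum(x, 0) -> [0, 1, 3] and inclusive_cumsum(x, 0) -> [1, 3, 6].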
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/gather.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
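+
+# Note: gather and scatter permute tokens in opposite directions, so the backward
+# of GatherOp is implemented with kernels.scatter (and ScatterOp's backward with
+# kernels.gather).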
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/histogram.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
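+
+# Illustrative semantics (assuming integer bincount behaviour of the kernel):
+# histogram(torch.tensor([0, 1, 1, 3], device="cuda"), 4) -> counts [1, 2, 0, 1].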
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1), which calls
+# torch.as_strided(...). Circumvent this chain to avoid the overhead it adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/padded_gather.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/padded_scatter.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/repeat.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
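+
+
+# Note: when every tiling factor is 1 the input is returned unchanged, avoiding
+# the copy that x.repeat(...) would otherwise make.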
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/replicate.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/round_up.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+    # do this in a custom kernel. We only expect
+    # to use this on arrays of fewer than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
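+
+
+# Worked example (illustrative):
+#   round_up(torch.tensor([1, 130, 256], dtype=torch.int32), 128)
+#   -> tensor([128, 256, 256], dtype=torch.int32)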
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/scatter.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/sort.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
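+
+# Callers with a known small key range can pass a smaller end_bit so the radix
+# sort only touches the needed bits; e.g. expert ids in [0, 128) need only
+# ceil(log2(128)) = 7 bits (illustrative).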
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/stk_autocast.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/sum.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
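+
+
+# Note: when the reduced dimension has size 1 the sum is a no-op, so the cheaper
+# squeeze is used instead of launching a reduction kernel.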
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/ops/topology.py b/build/torch211-cxx11-cu128-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/backend/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/backend/autocast.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/backend/sputnik.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
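+
+
+# Example: torch.randn(3, 4).t() has size (4, 3) and strides (1, 4), so it is
+# recognized here as a transposed view of contiguous storage.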
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/backend/triton_kernels.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+    error_string = "incompatible dimensions: tensor dimension of length {} must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
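+    # SDD: sampled dense-dense matmul. Computes lhs @ rhs only at the nonzero
+    # block positions given by (row_indices, column_indices); each program
+    # instance produces one BLOCK_M x BLOCK_N block of the sparse output.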
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store the accumulator block into the sparse output matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
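+    # DSD: dense = sparse @ dense. A is block-sparse (BCSR data plus metadata),
+    # B is dense. Each program computes one BLOCK_M x BLOCK_N tile of the dense
+    # output by iterating over the nonzero blocks in A's block row.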
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
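+    # DDS: dense = dense @ sparse. A is dense, B is block-sparse (BCSR).
+    # Each program computes one BLOCK_M x BLOCK_N tile of the dense output
+    # by iterating over the nonzero blocks in B's block column.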
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
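+
+# Illustrative example: for block-row offsets [0, 2, 3, 5] the kernel writes
+# out = [0, 0, 1, 2, 2], i.e. one block-row id per nonzero block.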
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/matrix.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# TODO:
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers.
+# 3. Make indentation consistent.
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
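+
+# Illustrative example for _transpose: with nonzero blocks at (0, 1) and
+# (1, 0) in a 2x2 block grid, row_indices = [0, 1] and column_indices = [1, 0].
+# Sorting by column gives gather_indices = [1, 0], so column_indices_t = [1, 0],
+# block_offsets_t = [1, 0] and offsets_t = [0, 1, 2] (one block per column).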
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
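+
+    # Illustrative layout: a (256, 256) matrix with blocking 128 and nonzero
+    # blocks at block positions (0, 1) and (1, 0) is stored as
+    # data.shape == (2, 128, 128), row_indices == [0, 1],
+    # column_indices == [1, 0], offsets == [0, 1, 2] (cumulative count of
+    # nonzero blocks per block row).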
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+                    f"Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
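+
+
+# Example usage (illustrative; assumes the stk.random/stk.ops helpers in this
+# package and 128x128 blocking):
+#
+#   x = stk.random.randn((256, 256), sparsity=0.5, blocking=128)
+#   y = stk.ops.ones_like(x)
+#   z = stk.ops.mul(x, y)   # same nonzero pattern as x, entries equal to x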
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/linear_ops.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
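+
+
+# Example usage (illustrative; assumes a CUDA device, float16 data, and the
+# 128x128 blocking expected by the default Triton config):
+#
+#   x = stk.random.randn((512, 512), sparsity=0.5, blocking=128).cuda()
+#   w = torch.randn(512, 512, dtype=torch.float16, device="cuda")
+#   y = stk.ops.dsd(x, w)        # dense = sparse @ dense
+#   s = stk.ops.sdd(w, w, x)     # sparse output with x's topology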
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler than the current implementation makes it look.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
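+
+# Illustrative round trip: for a 2D tensor x whose dimensions are divisible by
+# `blocking`, to_dense(to_sparse(x, blocking)) reproduces x exactly; all-zero
+# blocks are dropped by to_sparse and restored as zeros by to_dense.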
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/random/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/random/random_ops.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
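+
+
+# Example usage (illustrative): dense_mask(256, 512, 0.75, blocking=128)
+# returns a (256, 512) float32 0/1 tensor in which roughly 75% of the 128x128
+# blocks are zeroed; randn((256, 512), 0.75, 128) builds the corresponding
+# sparse stk.Matrix filled with normally distributed values.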
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py b/build/torch211-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from . import random_ops as random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/xpu_fused_moe.py b/build/torch211-cxx11-cu128-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM wrapper; see cutlass_grouped_gemm_xe2 below for the Xe2-specific variant.
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
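+    # e.g. exclusive_prefix_sum([2, 3, 1]) == [0, 2, 5, 6]; entry i is the
+    # first row owned by expert i and the last entry is the total row count.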
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
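+
+# Illustrative example: with num_tokens=1000 and num_experts_per_node=8 the
+# first candidate satisfying ceilDiv(1000, c) * 8 <= c is c=128
+# (8 blocks * 8 experts = 64 <= 128), so 128 is returned.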
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
+
+
+def implement_zp(qweight):
+    # Convert u4 to s4 so the GEMM kernel does not need to apply a zero point.
+    # Only the default zero point (8) is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
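+
+# Note (illustrative): for values in [-8, 7] the (sign << 3) | low-3-bits
+# packing above is exactly the 4-bit two's-complement encoding (v & 0xF),
+# e.g. -8 -> 0b1000, -1 -> 0b1111, 7 -> 0b0111.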
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+    # 4-bit weight formats use an [E, N, K] layout;
+    # other dtypes use [E, K, N].
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ func. Temporarily exposed here until GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+    # Weight scales are only passed for quantized (fp8 / int4 / mxfp4) weights.
+    gemm1_scales = w13_scales if (is_fp8 or is_int4 or is_mxfp4) else None
+    ops.cutlass_grouped_gemm_interface(
+        ptr_A=gemm1_input,
+        ptr_B=input_B,
+        ptr_scales=gemm1_scales,
+        ptr_bias=w13_bias,
+        ptr_D=gemm1_output,
+        expert_first_token_offset=expert_first_token_offset,
+        N=2 * inter_size,
+        K=hidden_size,
+        num_experts=num_experts_per_node,
+        is_B_int4=is_int4,
+        is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+    gemm2_scales = w2_scales if (is_fp8 or is_int4 or is_mxfp4) else None
+    ops.cutlass_grouped_gemm_interface(
+        ptr_A=input_A,
+        ptr_B=input_B,
+        ptr_scales=gemm2_scales,
+        ptr_bias=w2_bias,
+        ptr_D=gemm2_output,
+        expert_first_token_offset=expert_first_token_offset,
+        N=hidden_size,
+        K=inter_size,
+        num_experts=num_experts_per_node,
+        is_B_int4=is_int4,
+        is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
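+# Usage sketch for apply_jitter (illustrative values, not from the original code):
+# with moe_jitter_eps=0.01 each element is scaled by a factor drawn uniformly
+# from [0.99, 1.01).
+#
+#   x = torch.randn(4, 8)
+#   y = apply_jitter(x, moe_jitter_eps=0.01)
+#   assert ((y - x).abs() <= 0.01 * x.abs() + 1e-6).all()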
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
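+# compute_top_k returns a (values, indices) pair in both branches, since
+# Tensor.max(dim=-1, keepdim=True) and torch.topk both return named tuples.
+# Illustrative sketch:
+#
+#   logits = torch.randn(6, 16)            # [tokens, num_experts]
+#   vals, idx = compute_top_k(logits, 2)   # both have shape [6, 2]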
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
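+# Usage sketch for route_tokens_xpu (illustrative shapes; the sizes below are
+# assumptions, not values from the original code):
+#
+#   hs, ne, k = 64, 8, 2
+#   x = torch.randn(3, 5, hs)                              # [bs, sl, hs]
+#   router_w = torch.randn(ne, hs)
+#   logits, weights, idx = route_tokens_xpu(x, router_w, None, k, ne)
+#   # logits: [15, ne]; weights/idx: [15, k]; weights softmax-normalize over the
+#   # k selected experts unless moe_normalize_expert_weights rescales them.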
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
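+# Illustrative sketch (assuming the kernel's convention that bin i counts
+# occurrences of the value i, as used for per-expert token counts):
+#
+#   top_experts = torch.tensor([0, 2, 2, 1], device="cuda", dtype=torch.int32)
+#   histogram(top_experts, num_bins=4)   # -> tensor([1, 1, 2, 0])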
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
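+# Illustrative difference between the two variants, assuming the kernels follow
+# the usual exclusive/inclusive cumulative-sum convention:
+#
+#   x = torch.tensor([3, 1, 2], device="cuda")
+#   cumsum(x, dim=0, exclusive=True)    # -> tensor([0, 3, 4])
+#   cumsum(x, dim=0, exclusive=False)   # -> tensor([3, 4, 6])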
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
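+# Usage sketch (illustrative; assumes the values fit in `end_bit` bits, as expert
+# ids do in MoE routing):
+#
+#   ids = torch.tensor([2, 0, 1, 0], device="cuda", dtype=torch.int32)
+#   sorted_ids, perm = argsort(ids, end_bit=2)
+#   # sorted_ids -> tensor([0, 0, 1, 2]); perm holds the original positions in
+#   # sorted order, i.e. the gather indices for the permutation.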
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/activation_fn.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/all_to_all.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
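+# Usage sketch (illustrative; requires an initialized torch.distributed process
+# group). Each rank sends input_split_sizes[i] rows to rank i and receives
+# output_split_sizes[i] rows from rank i:
+#
+#   out, handle = all_to_all(x, output_split_sizes, input_split_sizes, group,
+#                            async_op=True)
+#   handle.wait()   # with async_op=False the handle is None and no wait is needed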
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/arguments.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in shared expert (purpose: to allow using a custom FC layer, e.g. te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+            try:
+                import triton
+            except ImportError:
+                raise ImportError('Triton is required for sparse MLP implementation')
+            # Compare parsed versions rather than raw strings so that e.g.
+            # '3.10.0' is not treated as older than '3.2.0'.
+            from packaging import version
+            if version.parse(triton.__version__) >= version.parse('3.2.0'):
+                raise ValueError(
+                    'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+                )
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
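+# Usage sketch (illustrative; SimpleNamespace stands in for Megatron's parsed
+# argument object, and constructing Arguments here assumes a CUDA device because
+# the default `device` factory is torch.cuda.current_device, plus a triton
+# version compatible with the default sparse mlp_impl):
+#
+#   from types import SimpleNamespace
+#   megatron_args = SimpleNamespace(hidden_size=2048, moe_num_experts=8, moe_top_k=2)
+#   args = from_megatron(megatron_args)   # copies every matching field onto Arguments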
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/common.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
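+# Illustrative sketch: under autocast the tensor is cast to the active autocast
+# dtype; outside autocast it is returned unchanged.
+#
+#   with torch.autocast("cuda", dtype=torch.bfloat16):
+#       y = cast_if_autocast_enabled(torch.randn(4, 4, device="cuda"))
+#       # y.dtype == torch.bfloat16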
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/dmlp_registry.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e. only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
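+# Usage sketch (illustrative; building Arguments here assumes a CUDA device and,
+# for the grouped backend, an available grouped_gemm installation):
+#
+#   args = Arguments(mlp_type='glu', mlp_impl='grouped')
+#   mlp = get(args)   # -> glu.GroupedGLU instance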
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/dmoe.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
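+# Illustrative: promote_scalar(torch.tensor(5)) -> tensor([5]); tensors that
+# already have at least one dimension are returned unchanged. This guards the
+# cumsum outputs below, which can come back 0-d (e.g. with a single expert).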
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+        # There is a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+ # the matrix muliplications. Caculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/gelu.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/glu.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+        # Activation function and GLU gating.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/memory_test.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
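+    # The factor of 2 below assumes 2-byte (bf16/fp16) parameters and gradients.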
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/mlp.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
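+# Illustrative: scale_gradient(w, s) is an identity in the forward pass but
+# multiplies the incoming gradient of `w` by `s` in the backward pass; the MoE
+# layers below use it with s = 1 / expert_parallel_world_size.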
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+    # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> paste -> modify of the LLM-Foundry MPTMLP class.
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # Use a weighted sum for the shared expert output,
+ # weighted by the number of experts used.
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/moe.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} tokens_per_expert entries '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+ f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
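+
+# Worked example of the scale above (hypothetical numbers): with
+# moe_num_experts=8, moe_loss_weight=0.1, num_layers=2, tokens=1024 and
+# moe_top_k=2,
+#
+#     scale = 0.1 * 8 / (2 * 1024 * 2) = 0.8 / 4096 ~= 1.95e-4
+#
+# and the returned loss is scale * dot(tokens_per_expert, expert_scores).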
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
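+
+ # Worked example (hypothetical numbers): with moe_top_k=2, tokens=4096,
+ # an expert-parallel world size of 1, 64 experts and
+ # moe_capacity_factor=1.25:
+ #     int(1.25 * (2 * 4096 * 1 / 64)) = int(1.25 * 128) = 160 tokens/expert.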
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # beforehand? Could we instead have the `torch.max` operation
+ # return 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
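+
+ # Illustrative example in plain terms (toy values, not the fused ops used
+ # above): for top_expert = [2, 0, 1, 0] and num_experts = 3,
+ #     bin_ids           = [0, 0, 1, 2]  # sorted expert ids
+ #     indices           = [1, 3, 2, 0]  # permutation grouping tokens by expert
+ #     tokens_per_expert = [2, 1, 1]
+ #     bins              = [2, 3, 4]     # inclusive cumsum = end of each bin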
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+ # expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/mpu.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
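+
+# Worked example (hypothetical configuration): with an expert parallel world
+# size of 8 and moe_num_experts=4, expert_sharding_degree() is min(8, 4) = 4,
+# hidden_sharding_degree() is 8 // 4 = 2, experts_per_rank() is 4 // 4 = 1,
+# and features_per_rank() is ffn_hidden_size // 2, i.e. each expert's FFN
+# weights are split across two ranks.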
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/router.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
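+
+# Shape sketch for the forward pass above (hypothetical example): for input x
+# of shape [sl, bs, hs] the router returns
+#     scores:         [sl * bs, moe_num_experts]  (softmax over experts)
+#     expert_weights: [sl * bs, moe_top_k]
+#     expert_indices: [sl * bs, moe_top_k]
+# e.g. with moe_top_k=2 and a scores row of [0.1, 0.6, 0.3] the selected
+# experts are [1, 2] with weights [0.6, 0.3] (optionally renormalized when
+# moe_normalize_expert_weights is set).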
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layers/sharedexpert_registry.py b/build/torch211-cxx11-cu130-aarch64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch211-cxx11-cu130-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..49f9a4a2738530bc50ce8497de9d5206075a5f2e
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0926657a5bf049020d315e3281ca24c455d318a0ec8d9afc14665a79f8c2f19c
+size 12073200
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_ops.py b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix an op name with this extension's namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_version.py b/build/torch211-cxx11-cu130-aarch64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/backend/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/backend/kernels.py b/build/torch211-cxx11-cu130-aarch64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have
+# CUDA. This preserves the original code while still enabling tests without a GPU.
+if torch.cuda.is_available() is False:
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has a greater or equal
+ # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
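+
+# Reference sketch (editorial illustration in plain PyTorch, not the Triton
+# kernel actually used): ignoring the blocking, gather() copies row
+# `indices[i] // top_k` of `x` into output row i, optionally scaled by
+# `weights[indices[i]]`.
+def _reference_gather(x, indices, weights, top_k):
+ idx = indices.long()
+ out = x[idx // top_k]
+ if weights is not None:
+ out = out * weights[idx].unsqueeze(-1)
+ return out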
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per entry in 'indices'. Array 'x' has a greater or
+ # equal number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/benchmark_util.py b/build/torch211-cxx11-cu130-aarch64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
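+
+
+# Usage sketch (hypothetical sizes, commented out so importing this module has
+# no side effects):
+#
+#     x = torch.randn(4096, 4096, device='cuda')
+#     w = torch.randn(4096, 4096, device='cuda')
+#     mean_ms, std_ms = benchmark_function(lambda: x @ w)
+#     log_benchmark('Dense GEMM', {'m': 4096, 'n': 4096, 'k': 4096}, mean_ms, std_ms)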
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/cpu_fused_moe.py b/build/torch211-cxx11-cu130-aarch64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
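+
+
+# Usage sketch for route_tokens_cpu (hypothetical shapes, commented out):
+#
+#     x = torch.randn(4, 64)            # 4 tokens, hidden_size=64
+#     router_w = torch.randn(8, 64)     # 8 experts
+#     logits, weights, ids = route_tokens_cpu(x, router_w, None, moe_top_k=2, moe_num_experts=8)
+#     # logits: [4, 8]; weights, ids: [4, 2]; weights sum to 1 over the top-k dim.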
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+ This implementation loops over the experts, but batches all tokens routed
+ to each expert into single tensor operations instead of per-token loops,
+ which keeps the work efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+ # Process the experts one at a time: for each expert, find the
+ # (token_idx, topk_pos) pairs that were routed to it.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
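+
+# Usage sketch (hypothetical shapes, standard non-interleaved layout,
+# commented out):
+#
+#     E, H, I = 4, 32, 64
+#     x = torch.randn(8, H)                       # 8 tokens
+#     w1 = torch.randn(E, H, 2 * I)               # per-expert gate/up projection
+#     w2 = torch.randn(E, I, H)                   # per-expert down projection
+#     _, tw, ti = route_tokens_cpu(x, torch.randn(E, H), None, 2, E)
+#     y = cpu_fused_moe(x, w1, w2, tw, ti, activation="silu", is_interleaved=False)
+#     # y.shape == x.shape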
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
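+# Minimal usage sketch for this CPU path (illustrative only; the attribute names on
+# `router`/`experts` and all shapes below are assumptions, not taken from a specific
+# checkpoint). Kept as a comment so importing this module stays side-effect free:
+#
+#     mlp = MegaBlocksMoeMLP()
+#     mlp.router = torch.nn.Linear(64, 8)                # hidden_size -> num_experts
+#     mlp.router.top_k = 2
+#     mlp.experts = torch.nn.Module()
+#     mlp.experts.num_experts = 8
+#     mlp.experts.gate_up_proj = torch.nn.Parameter(torch.randn(8, 64, 256) * 0.02)
+#     mlp.experts.down_proj = torch.nn.Parameter(torch.randn(8, 128, 64) * 0.02)
+#     out, weights = mlp(torch.randn(2, 5, 64))          # out: [2, 5, 64]
+#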
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/cpu_moe_cpp.py b/build/torch211-cxx11-cu130-aarch64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
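+# Sketch of a direct call (illustrative shapes; in practice topk_weights/topk_ids come
+# from route_tokens_cpu, and the expert weights may first need VNNI packing via
+# ops.convert_weight_packed, as done in CPUMegaBlocksMoeMLP.forward below):
+#
+#     M, K, N, E, top_k = 8, 64, 128, 16, 2
+#     x = torch.randn(M, K, dtype=torch.bfloat16)
+#     w1 = torch.randn(E, 2 * N, K, dtype=torch.bfloat16)   # gate+up, [E, 2N, K]
+#     w2 = torch.randn(E, K, N, dtype=torch.bfloat16)       # down,    [E, K, N]
+#     topk_weights = torch.rand(M, top_k)
+#     topk_ids = torch.randint(0, E, (M, top_k), dtype=torch.int32)
+#     out = fused_moe_cpp(x, w1, w2, topk_weights, topk_ids, is_vnni=False)  # [M, K]
+#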
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+        # Lazy one-time setup for GPT-OSS style checkpoints: detect MXFP4 precision
+        # configs and pack scales/weights into the layout expected by the C++ kernel.
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "CPUMegaBlocksMoeMLP"]
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/grouped_gemm/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/grouped_gemm/backend.py b/build/torch211-cxx11-cu130-aarch64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
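+
+# Usage sketch (illustrative sizes; grouped_gemm conventionally expects batch_sizes as a
+# CPU int64 tensor). Rows of all groups are stacked along dim 0 of 'a', with one weight
+# matrix per group in 'b':
+#
+#     a = torch.randn(5, 16, device="cuda", dtype=torch.bfloat16)      # 2 + 3 rows
+#     b = torch.randn(2, 16, 32, device="cuda", dtype=torch.bfloat16)  # [num_groups, 16, 32]
+#     sizes = torch.tensor([2, 3])
+#     c = gmm(a, b, sizes)                                             # -> [5, 32]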
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/grouped_gemm/ops.py b/build/torch211-cxx11-cu130-aarch64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/grouped_gemm_util.py b/build/torch211-cxx11-cu130-aarch64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored in this package (see the grouped_gemm/ subpackage), so
+    # this import cannot fail here; the try/except mirrors the upstream megablocks code.
+    _grouped_gemm_is_available = True
+except ImportError:
+    warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/layers.py b/build/torch211-cxx11-cu130-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
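+# The wrappers installed above all follow the same pattern (sketch): under torch.compile
+# tracing, return an empty tensor with the op's output shape/dtype so shape inference can
+# proceed without launching the CUDA kernel; otherwise dispatch to the original op:
+#
+#     def op_with_meta(x, *args):
+#         if torch.compiler.is_compiling():
+#             return torch.empty_like(x)   # meta path: shapes/dtypes only
+#         return original_op(x, *args)     # eager path: real kernel
+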
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
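+# Worked example of the sharding helpers above (illustrative numbers only):
+#
+#     expert_sharding_degree(world_size=8, moe_num_experts=4)      # -> 4
+#     hidden_sharding_degree(8, 4, ffn_hidden_size=3072)           # -> 8 // 4 = 2
+#     experts_per_rank(4, 8)                                       # -> 1 expert per rank
+#     features_per_rank(3072, 8, 4)                                # -> 3072 // 2 = 1536
+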
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
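+# Routing sketch (illustrative shapes; the input is flattened to tokens before routing):
+#
+#     router = torch.nn.Linear(64, 16)
+#     logits, weights, indices = route_tokens(
+#         torch.randn(2, 3, 64), router.weight, router.bias,
+#         moe_top_k=2, moe_num_experts=16,
+#     )
+#     # logits: [6, 16]; weights/indices: [6, 2], softmaxed over the selected experts
+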
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
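+# Note: the activation in mlp_forward above is the clamped "swiglu-oai" variant used by
+# GPT-OSS style models: gate is clamped from above at `limit`, up is clamped to
+# [-limit, limit], and the down projection is applied to (up + 1) * gate * sigmoid(alpha * gate).
+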
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
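+# Example of the weighted-sum path (illustrative): with moe_top_k=3,
+#
+#     combine_expert_shared_outputs(s, e, shared_expert_weighted_sum=True, moe_top_k=3)
+#
+# returns 0.25 * s + 0.75 * e, i.e. the shared expert counts as one of four experts.
+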
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f"Expected {num_layers_per_pipeline_stage} tokens_per_expert entries "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
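+# Worked example (illustrative): 1024 tokens, top_k=4, 128 experts, a single rank and
+# moe_capacity_factor=1.0 give tokens_per_expert = 4 * 1024 / 128 = 32, so the returned
+# capacity is 32 slots per expert.
+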
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
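+# Example (illustrative): with 4 experts, top_k=1, 8 tokens and perfectly balanced routing,
+# tokens_per_expert = [2, 2, 2, 2] and expert_scores.mean(dim=0) = [0.25, 0.25, 0.25, 0.25],
+# so the loss is (4 / (8 * 1)) * dot([2, 2, 2, 2], [0.25] * 4) = 0.5 * 2.0 = 1.0.
+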
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+    # Asynchronously exchange token counts across expert-parallel ranks
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
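+# Usage sketch (illustrative sizes; `mlp` is assumed to be a MegaBlocksMoeMLPWithSharedExpert
+# instance, defined below):
+#
+#     up_w, down_w, up_b, down_b = create_shared_expert_weights(
+#         hidden_size=1152, shared_expert_hidden_size=3072,
+#         device=torch.device("cuda"), dtype=torch.bfloat16,
+#         init_method=torch.nn.init.kaiming_uniform_,
+#     )
+#     mlp.set_shared_expert_weights(up_w, down_w, up_b, down_b)
+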
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert", "CPUMegaBlocksMoeMLP"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/megablocks/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/metadata.json b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a9813b81c6c98110d265c184f2016d728202289b
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/all_to_all_benchmark.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2B elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/binned_gather.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/binned_scatter.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/cumsum.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
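+
+
+# Illustrative note (not part of the upstream source): for a 1-D tensor such as
+# tokens_per_expert = [1, 2, 3], inclusive_cumsum(x, 0) yields the bin upper
+# bounds [1, 3, 6], while exclusive_cumsum(x, 0) yields the bin starts [0, 1, 3].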
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/gather.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/histogram.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
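+
+
+# Illustrative note (not part of the upstream source): the MoE layers call this
+# as tokens_per_expert = histogram(top_expert, num_experts), i.e. it counts how
+# many entries of `x` fall into each integer bin in [0, max_val).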
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/histogram_benchmark.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/matmul_benchmark.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/padded_gather.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/padded_scatter.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/padded_scatter_benchmark.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/permute_benchmark.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/repeat.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
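+
+
+# Illustrative note (not part of the upstream source): a tiling of all ones is a
+# no-op, e.g. repeat(x, torch.Size((1, 1))) returns x unchanged, while
+# repeat(x, torch.Size((2, 1))) tiles x twice along dim 0.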
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/replicate.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/round_up.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
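+
+
+# Illustrative example (not part of the upstream source): each entry is padded
+# up to the nearest multiple of `value` with a truncating divide, e.g.
+#
+#   x = torch.tensor([1, 128, 129], dtype=torch.int32)
+#   round_up(x, 128)  # -> tensor([128, 128, 256], dtype=torch.int32)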
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/scatter.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/sort.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
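+
+
+# Illustrative usage sketch (not part of the upstream source): the op returns the
+# radix-sorted keys together with the permutation that produced them, which the
+# MoE layers use to group token indices by expert, e.g.
+#
+#   top_expert = torch.tensor([2, 0, 1, 0], dtype=torch.int32, device="cuda")
+#   bin_ids, indices = sort(top_expert, 2)  # end_bit=2 covers values < 4
+#   # bin_ids -> [0, 0, 1, 2]; indices -> the input positions of those tokens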
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/sort_benchmark.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/stk_autocast.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/sum.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
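+
+
+# Illustrative note (not part of the upstream source): when the reduced dimension
+# has length one the values are unchanged, so a squeeze suffices, e.g.
+# sum(torch.ones(4, 1, 8), dim=1) returns a (4, 8) tensor of ones without
+# launching a reduction kernel.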
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/ops/topology.py b/build/torch211-cxx11-cu130-aarch64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/backend/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/backend/autocast.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/backend/sputnik.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/backend/triton_kernels.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ #Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
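+
+
+# Illustrative CPU reference for the kernel above (documentation only; the
+# helper below is a sketch and is not used by the Triton path): row_indices
+# expands BCSR row offsets into one row id per nonzero block, e.g.
+# offsets [0, 2, 3] -> row indices [0, 0, 1].
+def _row_indices_reference(offsets):
+    out = []
+    for row in range(len(offsets) - 1):
+        out.extend([row] * int(offsets[row + 1] - offsets[row]))
+    return out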
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/matrix.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D data.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+    block_rows = np.prod(shape[:-1]) // block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
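+
+# Worked example for _transpose (illustrative): with row_indices = [0, 0, 1]
+# and column_indices = [2, 0, 1], argsort gives gather_indices = [1, 2, 0],
+# so column_indices_t = [0, 1, 0], block_offsets_t = [1, 2, 0] and
+# offsets_t = [0, 1, 2, 3] (one nonzero block per block column).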
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+                    f"Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+                "t() expects a 2D Matrix, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
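+
+
+# Minimal construction sketch (illustrative documentation; the values are made
+# up and the guard keeps importing this module side-effect free):
+if __name__ == "__main__":
+    blocking = 2
+    # Two nonzero 2x2 blocks of a 4x4 matrix: (block row 0, block col 1)
+    # and (block row 1, block col 0).
+    example = Matrix((4, 4),
+                     torch.ones(2, blocking, blocking, dtype=torch.float16),
+                     torch.tensor([0, 1], dtype=torch.int16),
+                     torch.tensor([1, 0], dtype=torch.int16),
+                     torch.tensor([0, 1, 2], dtype=torch.int32))
+    example.validate()
+    print(example.shape, example.nnz, example.blocking)  # (4, 4) 8 2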
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops_test.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/linear_ops.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
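+
+
+# Usage sketch (illustrative; assumes a CUDA device and the package layout used
+# by the commented-out tests in linear_ops_test.py, run e.g. via `python -m`):
+if __name__ == "__main__":
+    import stk
+
+    if torch.cuda.is_available():
+        m = k = n = 256
+        blocking = 128
+        mask = stk.random.dense_mask(m, k, 0.5, blocking)
+        a = stk.ops.to_sparse((torch.randn(m, k) * mask).half(), blocking).cuda()
+        b = torch.randn(k, n, dtype=torch.float16, device="cuda")
+        print(dsd(a, b).shape)                   # sparse @ dense -> dense (m, n)
+        print(dds(b.t().contiguous(), a).shape)  # dense @ sparse -> dense (n, k)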
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/linear_ops_test.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
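+
+# Worked example (illustrative): with blocking=2 the block coordinate (1, 2)
+# expands to the four element coordinates of that block:
+# [[2, 4], [2, 5], [3, 4], [3, 5]].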
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
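+
+
+# Round-trip sketch (illustrative; the conversions themselves run on CPU and
+# mirror the commented-out MatrixOpsTest in matrix_ops_test.py):
+if __name__ == "__main__":
+    x = (torch.randn(8, 16) * (torch.rand(8, 16) > 0.5)).half()
+    sparse_x = to_sparse(x, blocking=1)
+    assert torch.equal(to_dense(sparse_x), x)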
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops_test.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/random/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/random/random_ops.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
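+
+
+# Quick check (illustrative): a 4x4 mask at 50% block sparsity with 2x2 blocks
+# keeps round(4 * 0.5) = 2 of the 4 blocks, i.e. 8 nonzero entries.
+if __name__ == "__main__":
+    example = dense_mask(4, 4, 0.5, blocking=2)
+    assert example.shape == (4, 4) and int(example.sum()) == 8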
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/stk/random/random_ops_test.py b/build/torch211-cxx11-cu130-aarch64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from .. import random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/xpu_fused_moe.py b/build/torch211-cxx11-cu130-aarch64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM path: expert offsets are computed on the host from per-expert token counts.
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+        for x in arr:
+ prefix.append(prefix[-1] + x)
+ return prefix
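+    # e.g. exclusive_prefix_sum([3, 1, 2]) -> [0, 3, 4, 6]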
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
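+
+# Example (illustrative): with 1,000 tokens and 8 experts per node the loop
+# settles on 128 tokens per block, since ceil(1000 / 128) * 8 = 64 <= 128
+# while the smaller candidate block sizes fail that check.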
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
+
+
+def implement_zp(qweight):
+    # Convert u4 nibbles to s4 so the GEMM kernel does not need to apply a
+    # zero point. Only the default zero point (8) is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
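+
+# Worked example (illustrative): an input byte 0x00 (two u4 zeros) maps to
+# 0x88 (two s4 values of -8), and 0xFF (two u4 fifteens) maps to 0x77
+# (two s4 values of +7).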
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ op; temporarily exposed here until GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
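+
+# Shape sketch (illustrative): for 4 tokens of hidden size 16 with 8 experts
+# and top-2 routing, route_tokens_xpu(torch.randn(4, 16), torch.randn(8, 16),
+# None, moe_top_k=2, moe_num_experts=8) returns logits of shape (4, 8) plus
+# expert weights and indices of shape (4, 2), with the weights softmaxed over
+# the two selected experts.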
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
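+
+# Illustrative example (not part of the original source): for a 1-D tensor
+# x = [1, 2, 3] along dim 0,
+#   inclusive_cumsum -> [1, 3, 6]
+#   exclusive_cumsum -> [0, 1, 3]
+# Both wrappers copy the kernel result into the caller-provided `out` tensor.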
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
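+
+# Illustrative usage (hypothetical values, assumes a CUDA device and the
+# compiled ops): counting how many tokens were routed to each of 4 experts.
+#
+#   expert_ids = torch.tensor([0, 2, 2, 3], device="cuda", dtype=torch.int32)
+#   histogram(expert_ids, num_bins=4)  # -> tensor([1, 0, 2, 1])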
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
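+
+# Illustrative usage (not part of the original source): turning per-expert
+# token counts into bin offsets.
+#
+#   counts = histogram(expert_ids, num_bins=4)      # e.g. [1, 0, 2, 1]
+#   starts = cumsum(counts, dim=0, exclusive=True)  # [0, 1, 1, 3] - bin start offsets
+#   ends = cumsum(counts, dim=0)                     # [1, 1, 3, 4] - bin end offsets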
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
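+
+# Illustrative usage (not part of the original source): sorting expert ids
+# while tracking which token each slot came from.
+#
+#   ids = torch.tensor([2, 0, 1, 0], device="cuda", dtype=torch.int32)
+#   sorted_ids, order = argsort(ids)  # sorted_ids=[0, 0, 1, 2], order=[1, 3, 2, 0]
+#
+# Note that `order` shares the dtype of `ids`, matching the underlying kernel.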
+
+
+# Export public API
+__all__ = [
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/activation_fn.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/all_to_all.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
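+
+
+# Illustrative usage (hypothetical sizes, assumes torch.distributed is
+# initialized): each rank sends `input_split_sizes[i]` rows of `x` to rank i
+# and receives `output_split_sizes[i]` rows from rank i.
+#
+#   out, handle = all_to_all(x, output_split_sizes, input_split_sizes, group,
+#                            async_op=True)
+#   handle.wait()  # only needed when async_op=True; otherwise handle is None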
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/arguments.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in the shared expert (allows using a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using a weighted sum for the shared expert output (weighted by the number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+                import triton
+                from packaging import version
+                # Compare parsed versions; a plain string comparison misorders e.g. '3.10.0' vs '3.2.0'.
+                if version.parse(triton.__version__) >= version.parse('3.2.0'):
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
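+
+
+# Illustrative usage (hypothetical values, not part of the original source):
+#
+#   args = Arguments(
+#       hidden_size=1024,
+#       ffn_hidden_size=4096,
+#       moe_num_experts=8,
+#       moe_top_k=2,
+#       mlp_impl='grouped',  # sidesteps the sparse-path triton>=3.2.0 restriction
+#       fp16=False,
+#       bf16=True,
+#   )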
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/common.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
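+
+
+# Illustrative behaviour (not part of the original source): inside
+# `torch.autocast(device_type='cuda', dtype=torch.bfloat16)`, a float32 CUDA
+# tensor passed to cast_if_autocast_enabled() comes back as bfloat16; outside
+# an autocast region the tensor is returned unchanged.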
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/dmlp_registry.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
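+
+
+# Illustrative usage (not part of the original source):
+#
+#   args = Arguments(mlp_type='glu', mlp_impl='grouped')
+#   expert_mlp = get(args)  # -> glu.GroupedGLU instance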
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/dmoe.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+        # There is a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
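+
+    # Illustrative example (hypothetical values): with blocking=128 and
+    # tokens_per_expert = [3, 1], the padded counts are [128, 128], so
+    # padded_bins = [128, 256] and bins = [3, 4]; padding gives each expert a
+    # block-aligned region in the permuted activation matrix.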
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/gelu.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
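+    # Derivative of the tanh-approximate GELU, 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))):
+    # 0.79788456 is approx. sqrt(2/pi) and 0.1070322243 is approx. 3 * 0.044715 * sqrt(2/pi).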
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/glu.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+        # Apply the activation and GLU gating.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+    """GLU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/memory_test.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/mlp.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+    def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
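+
+    # Shape sketch (illustrative, for the capacity-based MoE path that feeds
+    # this module): x is [experts, capacity, hidden_size], w1 is
+    # [experts, hidden_size, ffn_per_rank] and w2 is [experts, ffn_per_rank,
+    # hidden_size], so each bmm batches one GEMM per expert.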
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+            # Use a weighted sum for the shared expert output,
+            # weighted by the number of experts used.
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/moe.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
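+
+    # Worked example (hypothetical values): with moe_top_k=2, 1024 tokens, a
+    # world size of 1, 8 experts and moe_capacity_factor=1, each expert keeps
+    # at most int(1 * 2 * 1024 * 1 / 8) = 256 tokens; the rest are dropped.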
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
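+        # Toy illustration (hypothetical): with 2 devices and 4 experts (2 per
+        # device), tokens routed as [e3, e0, e2, e1] are first sorted locally
+        # into [e0, e1 | e2, e3]; the all-to-all then delivers the e0/e1 group
+        # to device 0 and the e2/e3 group to device 1, and each device re-sorts
+        # its received chunks by expert before running the expert MLPs.
+        #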
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+        # If we're sharding the experts along the hidden dimension,
+        # multiple devices own parts of the same set of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+        # If we're sharding the experts along the hidden dimension,
+        # multiple devices own parts of the same set of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+            # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/mpu.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
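+#
+# Illustrative (hypothetical) example: an expert-parallel world size of 8 with
+# moe_num_experts=4 gives expert_sharding_degree=4 and hidden_sharding_degree=2,
+# i.e. each rank owns one expert and each expert's FFN features are split 2 ways.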
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/router.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
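+#
+# For example, with num_experts=4 the first eight entries of the assignment
+# (flattened over tokens and top-k slots) are [0, 1, 2, 3, 0, 1, 2, 3],
+# regardless of the router logits.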
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch211-cxx11-cu130-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch211-cxx11-cu130-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..e07c8ab8c5e122896ef80f10303311422b02dc06
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f548de7e56f7b35bddd555b88836ff77d731dfa6d71c52c2198a54607dba186
+size 12041592
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_ops.py b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_version.py b/build/torch211-cxx11-cu130-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/backend/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/backend/kernels.py b/build/torch211-cxx11-cu130-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub triton.autotune when testing in an environment that does not have CUDA.
+# This approach preserves the original code but enables testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
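+    # For example (illustrative), with two experts and padded_bins = [128, 256],
+    # the gathered output has 256 rows regardless of how many tokens were
+    # actually routed to each expert.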
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/benchmark_util.py b/build/torch211-cxx11-cu130-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
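+
+
+# Example usage (illustrative; `model` and `x` are hypothetical):
+#
+#   mean_ms, std_ms = benchmark_function(lambda: model(x))
+#   log_benchmark('MoE forward', {'batch_size': x.shape[0]}, mean_ms, std_ms)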
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/cpu_fused_moe.py b/build/torch211-cxx11-cu130-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
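+# Shape sketch (illustrative): for x of shape [2, 8, 2880], a router_weight of
+# shape [32, 2880] and moe_top_k=4, logits has shape [16, 32] and both
+# expert_weights and expert_indices have shape [16, 4].
+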
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+ This implementation processes all experts in parallel using batched operations
+ instead of sequential for loops, which is more efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+ # Build expert mask: which tokens go to which expert
+ # expert_mask[expert_id] contains indices of (token_idx, topk_pos) pairs
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py b/build/torch211-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
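+# Example call (illustrative; the tensor names are hypothetical):
+#
+#   out = fused_moe_cpp(x_flat, w1, w2, topk_weights, topk_ids.to(torch.int32),
+#                       w1_bias=b1, w2_bias=b2, alpha=1.702, limit=7.0)
+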
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+                # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8 (mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP"]
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/grouped_gemm/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/grouped_gemm/backend.py b/build/torch211-cxx11-cu130-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/grouped_gemm/ops.py b/build/torch211-cxx11-cu130-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
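+
+
+# Hedged usage sketch for gmm (shapes, dtype and device are assumptions for
+# illustration only, not a documented API contract):
+#
+#   a = torch.randn(6, 16, device="cuda", dtype=torch.bfloat16)      # 6 rows
+#   b = torch.randn(3, 16, 32, device="cuda", dtype=torch.bfloat16)  # 3 groups
+#   batch_sizes = torch.tensor([2, 3, 1])  # rows of `a` assigned to each group
+#   out = gmm(a, b, batch_sizes)           # out has shape [6, 32]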
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/grouped_gemm_util.py b/build/torch211-cxx11-cu130-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # grouped_gemm is vendored inside this package, so importing the local copy
+ # is enough to confirm it is available.
+ from . import grouped_gemm  # noqa: F401
+ _grouped_gemm_is_available = True
+except ImportError:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+ '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+# Expose the vendored grouped_gemm modules under their original names.
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/layers.py b/build/torch211-cxx11-cu130-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
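+# Illustrative sketch of what the patched ops enable under torch.compile
+# (`routed` and its arguments are assumptions, not part of this module's API):
+#
+#   @torch.compile
+#   def routed(top_experts):
+#       bin_ids, indices = ops.sort(top_experts, 7)
+#       counts = ops.histogram(top_experts, 128)
+#       return bin_ids, indices, counts
+#
+# While torch.compile traces this function, torch.compiler.is_compiling() is
+# True, so the wrappers above return correctly shaped empty tensors instead of
+# launching the CUDA kernels, which lets tracing proceed.
+
+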
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
+
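+# Illustrative sketch of how the sharding helpers compose (the world size,
+# expert count and FFN width below are assumptions, not defaults):
+#
+#   world_size, num_experts, ffn = 4, 128, 3072
+#   expert_sharding_degree(world_size, num_experts)        # 4
+#   hidden_sharding_degree(world_size, num_experts, ffn)   # 1
+#   experts_per_rank(num_experts, world_size)              # 32
+#   features_per_rank(ffn, world_size, num_experts)        # 3072
+
+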
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
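+# Hedged usage sketch for route_tokens (shapes are assumptions for
+# illustration):
+#
+#   x = torch.randn(2, 8, 1152, device="cuda")            # [bs, sl, hs]
+#   router_weight = torch.randn(128, 1152, device="cuda")
+#   logits, weights, idx = route_tokens(
+#       x, router_weight, None, moe_top_k=4, moe_num_experts=128)
+#   # logits: [16, 128]; weights and idx: [16, 4] (top-4 experts per token)
+
+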
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
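+# The block above is the clamped, interleaved SwiGLU used by gpt-oss style
+# experts. Worked example on assumed scalars (alpha = 1.702, limit = 7.0):
+#   gate = 2.0, up = 0.5
+#   glu = gate * sigmoid(gate * alpha) = 2.0 * sigmoid(3.404) ≈ 1.936
+#   (up + 1) * glu ≈ 1.5 * 1.936 ≈ 2.90   # value fed into the down projection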
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
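+# Example of the weighted-sum branch (assumed moe_top_k = 3): the shared expert
+# gets weight 1 / (3 + 1) = 0.25 and the routed experts get 3 / 4 = 0.75, so
+# the output is 0.25 * shared_expert_out + 0.75 * expert_out.
+
+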
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
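+# Worked example for expert_capacity (assumed values): tokens = 4096,
+# top_k = 4, num_experts = 128, a single rank and capacity_factor = 1.0 give
+#   tokens_per_expert = 4 * 4096 * 1 / 128 = 128
+# so each expert is sized for int(1.0 * 128) = 128 token slots.
+
+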
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+ tokens, score_num_experts = expert_scores.size()
+ assert score_num_experts == num_experts
+ assert len(tokens_per_expert.size()) == 1
+ (count_num_experts,) = tokens_per_expert.size()
+ assert count_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
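+# Worked example (assumed values): with num_experts = 4, top_k = 2 and 8 tokens
+# routed perfectly uniformly, tokens_per_expert = [4, 4, 4, 4] and the mean
+# router score per expert is 0.25, so
+#   loss = (4 / (8 * 2)) * dot([4, 4, 4, 4], [0.25, 0.25, 0.25, 0.25]) = 1.0,
+# the value this auxiliary loss takes for a perfectly balanced router.
+
+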
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
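+# Sketch of the metadata indices_and_bins produces (inputs are assumptions):
+# for top_expert = [2, 0, 2, 1] and num_experts = 4,
+#   tokens_per_expert = [1, 1, 2, 0]
+#   bins (inclusive cumsum)    = [1, 2, 4, 4]
+#   bin_ids (sorted expert id) = [0, 1, 2, 2]
+#   indices (sort permutation) = [1, 3, 0, 2]
+
+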
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Launch the exchange asynchronously so the local permutation below
+ # overlaps with the token-count communication.
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
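+# Hedged usage sketch for create_shared_expert_weights (sizes and the
+# initializer are assumptions for illustration):
+#
+#   up_w, down_w, up_b, down_b = create_shared_expert_weights(
+#       hidden_size=1152,
+#       shared_expert_hidden_size=3072,
+#       device=torch.device("cuda"),
+#       dtype=torch.bfloat16,
+#       init_method=torch.nn.init.xavier_uniform_,
+#   )
+#   # up_w: [3072, 1152], down_w: [1152, 3072]; biases are returned as None
+
+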
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/megablocks/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/metadata.json b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a9813b81c6c98110d265c184f2016d728202289b
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2B elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/binned_gather.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/binned_scatter.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/cumsum.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/gather.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/histogram.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/histogram_benchmark.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/matmul_benchmark.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/padded_gather.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
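
`padded_gather` permutes tokens into contiguous per-expert blocks whose sizes are rounded up so that each expert's slice aligns with the 128-wide blocking used by the block-sparse matmuls. The sketch below mirrors the (commented-out) benchmark code in this tree and is illustrative only; it assumes a CUDA device and the same `megablocks` import path as above.

```python
import torch
from megablocks import ops  # assumed import path

sl, hs, ne, top_k = 4096, 1024, 8, 1
x = torch.randn(sl, hs, dtype=torch.float16, device="cuda")

# Route each token to an expert and sort the assignments by expert id.
top_expert = torch.randint(0, ne, (sl * top_k,), dtype=torch.int32, device="cuda")
bin_ids, indices = ops.sort(top_expert)

# Per-expert counts, padded up to the 128-wide block size.
tokens_per_expert = ops.histogram(top_expert, ne)
padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
bins = ops.inclusive_cumsum(tokens_per_expert, 0)

# Tokens laid out expert-by-expert, with padding rows between experts.
x_permuted = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
```
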
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/padded_scatter.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
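
`padded_scatter` is the inverse of `padded_gather`: it un-permutes the expert outputs back to the original token order and, when `top_k > 1`, combines the `top_k` copies of each token using the router `weights`. Continuing the hedged sketch after `padded_gather` above (same assumptions):

```python
# The expert MLPs would run on x_permuted here; reuse it to show the un-permute.
weights = torch.rand(sl * top_k, dtype=torch.float16, device="cuda")

x_restored = ops.padded_scatter(
    x_permuted, indices, bin_ids, weights, bins, padded_bins, top_k,
)
assert x_restored.shape == (sl, hs)
```
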
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/permute_benchmark.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/repeat.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
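
`repeat` is a thin wrapper around `torch.Tensor.repeat` that skips the copy when the tiling is all ones; a tiny illustration of the function defined above:

```python
import torch

x = torch.arange(4)
assert repeat(x, torch.Size((1,))) is x            # all-ones tiling: returns x unchanged
assert repeat(x, torch.Size((2,))).shape == (8,)   # otherwise falls through to x.repeat(...)
```
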
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/replicate.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/round_up.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+    # do this in a custom kernel. We only expect
+    # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
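
`round_up` rounds every element of an int32 tensor up to the nearest multiple of `value` using truncating division, which is how the padded gather/scatter paths align per-expert token counts with the 128-wide block size. A small example using the function above:

```python
import torch

tokens_per_expert = torch.tensor([5, 128, 130, 0], dtype=torch.int32)
print(round_up(tokens_per_expert, 128))
# tensor([128, 128, 256,   0], dtype=torch.int32)
```
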
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/scatter.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/sort.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
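
`ops.sort` radix-sorts an integer tensor and also returns a second tensor that the gather/scatter code above consumes as the argsort permutation of the input; the optional `end_bit` bounds how many low bits the radix sort inspects (it defaults to the full width of the dtype). A hedged sketch, assuming a CUDA device and the `megablocks` import path used earlier:

```python
import torch
from megablocks import ops  # assumed import path

top_expert = torch.randint(0, 8, (1024,), dtype=torch.int32, device="cuda")

# 8 expert ids fit in 3 bits, so only the low 3 bits need to be sorted.
bin_ids, indices = ops.sort(top_expert, 3)

# bin_ids are the expert ids in sorted order; indices maps back to token positions.
assert torch.equal(bin_ids, top_expert[indices.long()])
```
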
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/sort_benchmark.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/stk_autocast.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+        # Recursively cast dictionary keys and values.
+        return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
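
These vendored decorators make a custom `torch.autograd.Function` cooperate with `torch.autocast`: `custom_fwd` casts eligible CUDA floating-point arguments to the current autocast dtype and then runs the forward with autocast disabled, and `custom_bwd` runs the backward with autocast disabled as well. A minimal sketch of how the ops in this directory apply them; `ScaleOp` and the import path are illustrative assumptions, not part of the library.

```python
import torch
from megablocks.ops.stk_autocast import custom_bwd, custom_fwd  # assumed import path


class ScaleOp(torch.autograd.Function):
    # Hypothetical op used only to illustrate the decorators.

    @staticmethod
    @custom_fwd
    def forward(ctx, x: torch.Tensor) -> torch.Tensor:
        # Under autocast, x arrives here already cast to the autocast dtype
        # (e.g. float16) and autocast is disabled for the body.
        return x * 2

    @staticmethod
    @custom_bwd
    def backward(ctx, grad: torch.Tensor) -> torch.Tensor:
        return grad * 2


with torch.autocast(device_type="cuda", dtype=torch.float16):
    y = ScaleOp.apply(torch.randn(8, device="cuda", requires_grad=True))
```
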
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/sum.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
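
This `sum` helper avoids launching a reduction kernel when the reduced dimension has length one, returning a squeeze instead; the result matches `x.sum(dim)` either way. A quick check using the function above (which intentionally shadows the builtin `sum` inside the ops package):

```python
import torch

x = torch.randn(1, 4)
y = torch.randn(3, 4)

assert torch.equal(sum(x, dim=0), x.squeeze(0))      # length-1 dim: just a squeeze
assert torch.allclose(sum(y, dim=0), y.sum(dim=0))   # otherwise: a real reduction
```
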
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/ops/topology.py b/build/torch211-cxx11-cu130-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
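
`ops.topology` materializes the column indices (as int16) of the nonzero blocks in the block-sparse "topology" matrix that the dMoE layer multiplies against, given the padded per-expert offsets; the commented-out matmul benchmark above shows the full construction. A hedged sketch under the same CUDA and import-path assumptions, with illustrative sizes:

```python
import torch
from megablocks import ops  # assumed import path

blocking, ffn_hidden_size = 128, 512
padded_tokens_per_expert = torch.tensor([256, 128, 384, 256], dtype=torch.int32, device="cuda")
padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)

block_rows = int(padded_tokens_per_expert.sum()) // blocking
blocks_per_row = ffn_hidden_size // blocking

# One int16 column index per nonzero block: block_rows * blocks_per_row entries.
column_indices = ops.topology(padded_bins, blocking, block_rows, blocks_per_row)
```
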
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/backend/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/backend/autocast.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+        # Recursively cast dictionary keys and values.
+        return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/backend/sputnik.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/backend/triton_kernels.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ #Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
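
`_row_indices_kernel` expands BCSR row offsets into an explicit per-block row index: each program handles one block row and writes its own row id into every nonzero-block slot of that row. The pure-PyTorch reference below computes the same thing on CPU and can be used to sanity-check the kernel; the function name is illustrative only.

```python
import torch


def row_indices_reference(offsets: torch.Tensor) -> torch.Tensor:
    # offsets[i]:offsets[i + 1] is the range of nonzero blocks in block row i.
    out = torch.empty(int(offsets[-1]), dtype=torch.int32)
    for row in range(offsets.numel() - 1):
        out[int(offsets[row]):int(offsets[row + 1])] = row
    return out


# Three block rows with 2, 0 and 3 nonzero blocks respectively.
print(row_indices_reference(torch.tensor([0, 2, 2, 5])))
# tensor([0, 0, 2, 2, 2], dtype=torch.int32)
```
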
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/matrix.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D data.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
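+
+
+# Usage sketch (assuming this package is importable as `stk` with the sibling
+# stk.ops helpers; the calls below are illustrative, not part of this module):
+#
+#   import torch, stk
+#
+#   dense = torch.randn(256, 256).to(torch.float16)
+#   sparse = stk.ops.to_sparse(dense, blocking=128)    # -> Matrix in BCSR form
+#   sparse.validate()
+#   print(sparse.shape, sparse.nnz, sparse.blocking)   # (256, 256), 4 * 128**2, 128
+#   round_trip = stk.ops.to_dense(sparse)              # back to a dense torch.Tensor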
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+ to use the topoplogy of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
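+
+# Usage sketch (a and b are stk Matrix objects that share a sparsity pattern,
+# e.g. both built via ops.to_sparse from dense tensors with the same mask):
+#
+#   c = mul(a, b)                    # element-wise product, reuses a's topology
+#   dense_c = ops.to_dense(c)        # materialize for inspection (from .matrix_ops)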
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/linear_ops.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
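+# Naming convention: each op is <output><lhs><rhs>, where 'd' means dense
+# (torch.Tensor) and 's' means block-sparse (stk Matrix):
+#   dsd(a, b):       sparse a @ dense b  -> dense torch.Tensor
+#   dds(a, b):       dense a  @ sparse b -> dense torch.Tensor
+#   sdd(a, b, topo): dense a  @ dense b  -> sparse Matrix with topo's sparsity pattern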
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
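+
+
+# Round-trip sketch (see matrix_ops_test.py; dense_mask comes from the sibling
+# random_ops module and is used here only for illustration):
+#
+#   mask = dense_mask(128, 256, sparsity=0.5, blocking=16)
+#   x = (torch.randn(128, 256) * mask).to(torch.float16)
+#   sp = to_sparse(x, blocking=16)
+#   assert torch.equal(to_dense(sp), x)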
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/random/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/random/random_ops.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
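+
+
+# Usage sketch (all values illustrative):
+#
+#   m = dense_mask(256, 256, sparsity=0.75, blocking=16)   # dense 0/1 torch.Tensor
+#   sp = mask(256, 256, sparsity=0.75, blocking=16)        # stk Matrix of ones
+#   r = randn((4, 256, 256), sparsity=0.5, blocking=16)    # random sparse Matrix viewed as 3D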
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/stk/random/random_ops_test.py b/build/torch211-cxx11-cu130-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from . import random_ops as random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/xpu_fused_moe.py b/build/torch211-cxx11-cu130-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# default
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+        for x in arr:
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
+
+
+def implement_zp(qweight):
+ # change u4 to s4 to avoid zero point in gemm kernel
+ # only support default zero point now
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ function. Temporarily exposed here before GEMM fusion.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
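+# Call sketch (bf16 weights, no quantization; shapes follow the docstring above,
+# and the tensor names are illustrative assumptions, not defaults of this module):
+#
+#   out = xpu_fused_moe(
+#       hidden_states=x,                          # [num_rows, hidden_size], on "xpu"
+#       w13=w13, w13_scales=None, w13_bias=None,  # [E, 2*inter_size, hidden_size]
+#       w2=w2, w2_scales=None, w2_bias=None,      # [E, hidden_size, inter_size]
+#       topk_weights=weights.float(), topk_ids=ids,
+#       n_experts_per_token=top_k, activation="silu", num_experts=E)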
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
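+
+
+# Convenience usage sketch (illustrative; tensors must live on the device these
+# kernels target):
+#
+#   x = torch.randint(0, 8, (1024,), dtype=torch.int32, device=device)
+#   inc = cumsum(x, dim=0)                  # inclusive cumulative sum
+#   exc = cumsum(x, dim=0, exclusive=True)  # exclusive cumulative sum
+#   vals, idxs = argsort(x, end_bit=3)      # radix sort keyed on the low 3 bits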
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/activation_fn.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/all_to_all.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/arguments.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in shared expert (purpose: to allow using a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by the number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+        if self.__getattribute__('mlp_impl') == 'sparse':
+            from packaging import version
+            try:
+                import triton
+                if version.parse(triton.__version__) >= version.parse('3.2.0'):
+                    raise ValueError(
+                        'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+                    )
+            except ImportError:
+                raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
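+
+
+# Configuration sketch (illustrative only; the values are arbitrary). Note that
+# the default `device` factory calls torch.cuda.current_device(), so constructing
+# Arguments assumes a CUDA-capable environment:
+#
+#     args = Arguments(
+#         hidden_size=1024,
+#         ffn_hidden_size=4096,
+#         moe_num_experts=8,
+#         moe_top_k=2,
+#         mlp_impl='grouped',  # 'sparse' additionally requires triton < 3.2.0
+#         fp16=False,
+#         bf16=True,
+#     )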
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/common.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
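+
+
+# Behaviour sketch (illustrative only): inside an autocast region the input is
+# recast to the active autocast dtype before the tokens are permuted, so the
+# bandwidth-heavy routing runs in the lower precision:
+#
+#     with torch.autocast('cuda', dtype=torch.bfloat16):
+#         x = cast_if_autocast_enabled(x)  # bfloat16 copy of a CUDA tensor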
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/dmlp_registry.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
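+
+
+# Selection sketch (illustrative only): the registry is keyed by mlp_type
+# ('mlp' or 'glu') and then by mlp_impl ('grouped' or 'sparse'), e.g.
+#
+#     args = Arguments(mlp_type='glu', mlp_impl='grouped')
+#     expert_mlp = get(args)  # -> glu.GroupedGLU(args)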
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/dmoe.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+        # There is a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
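+
+
+# Forward-path sketch (illustrative only): the dropless MoE routes tokens, pads
+# per-expert token counts up to the 128-wide block size, runs the experts as one
+# block-sparse (or grouped) matmul, and un-routes the results:
+#
+#     layer = dMoE(args).cuda()
+#     out = layer(x)  # x: [seq_len, batch, hidden]; out is a tensor, or
+#                     # (tensor, bias) when args.bias and args.return_bias are set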
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/gelu.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
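+
+
+# Reference for the hand-written backward above (illustrative): with the tanh
+# approximation gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))),
+# the constants 0.79788456 ~ sqrt(2/pi) and 0.1070322243 ~ 3 * 0.044715 * sqrt(2/pi)
+# come from d gelu(x)/dx; _gelu_backward_inplace multiplies the incoming gradient
+# by that derivative in place to avoid allocating another block-sparse tensor.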
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/glu.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+        w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1), self.scale_grad(self.w2)
+        w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+        # Activation and GLU gating.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+        w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
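+
+
+# Computation sketch (illustrative only): per expert e the GLU computes
+#
+#     y = (act(x @ w1[e].T) * (x @ v1[e].T)) @ w2[e]
+#
+# SparseGLU expresses this with block-sparse sdd/dsd kernels over the routing
+# topology, while GroupedGLU batches all experts in grouped GEMMs keyed by
+# `batch_sizes` (per-expert token counts, int64 on CPU).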
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/memory_test.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
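+
+
+# Launch sketch (illustrative only): this benchmark expects one process per GPU
+# so that dist.init_process_group(backend='nccl') succeeds, e.g. something like
+#
+#     torchrun --nproc-per-node=8 memory_test.py
+#
+# (the exact invocation depends on how this build directory is installed).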
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/mlp.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+        # Activation function.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+    Note: this is a copy -> paste -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+            # weighted by the number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
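+
+
+# Computation sketch (illustrative only): SparseMLP evaluates the experts as
+#
+#     h   = sdd(x, w1.T, topo)   # dense x (sparse topology) -> block-sparse h
+#     out = dsd(act(h), w2)      # block-sparse x dense      -> dense out
+#
+# The memory-optimized variants recompute act(h) during backward instead of
+# saving it, trading a little compute for a smaller activation footprint.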
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/moe.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+        tokens_per_expert: torch.Tensor,  # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+            # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to set up for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/mpu.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
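+
+
+# Illustrative sharding example (not part of the original module): with a
+# 16-way expert-parallel group, moe_num_experts=8 and an even
+# ffn_hidden_size, expert_sharding_degree() is 8 and hidden_sharding_degree()
+# is 2, so each rank owns experts_per_rank() == 1 expert and
+# features_per_rank() == ffn_hidden_size // 2 of its hidden features.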
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/router.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
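+
+
+# Shape note (illustrative): each saved logits tensor is [tokens, num_experts],
+# so the value returned above stacks one scalar per router layer, namely
+# moe_zloss_weight * mean over tokens of logsumexp(logits, dim=1) ** 2.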
+
+
+# NOTE: To enable end-to-end benchmarking without convergence, we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
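+
+
+# Shape walkthrough (illustrative, not executed): for an input x of shape
+# [sl, bs, hidden_size], logits and scores are [sl * bs, moe_num_experts]
+# and expert_weights / expert_indices are [sl * bs, moe_top_k]; the
+# keepdim=True in _top_k preserves the trailing dimension when moe_top_k == 1.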
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_megablocks_xpu_7a6bcf4.abi3.so b/build/torch211-cxx11-xpu20253-x86_64-linux/_megablocks_xpu_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..ee486c8abce8a03e1a612e789037d4c3a4793807
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_megablocks_xpu_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a732142f2d8813f0cbfc6fd912e421b57707789a7a35b8063b141b52182dfc5
+size 5381792
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_ops.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c8dd6eeccd632df5e23111e5dd5221d3e1fcb47
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_xpu_7a6bcf4
+ops = torch.ops._megablocks_xpu_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_xpu_7a6bcf4::{op_name}"
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_version.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/backend/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/backend/kernels.py b/build/torch211-cxx11-xpu20253-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have
+# CUDA. This approach preserves the original code but enables testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has a greater or equal
+ # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
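+
+
+# Illustrative example (assumed values): with top_k=1, three experts and
+# per-expert token counts [2, 3, 1], bins = [2, 5, 6]. If each expert's
+# slot count is padded up to a multiple of 128, padded_bins = [128, 256, 384]
+# and the gathered output has padded_bins[-1] = 384 rows, with the unused
+# rows left as zeros.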
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding, so the number of output rows equals the
+ # number of input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has a greater or equal
+ # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
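+
+
+# Illustrative example (assumed values): for x of shape [tokens, hidden_size],
+# 4 experts and expert_capacity=64, the result has shape [4, 64, hidden_size];
+# capacity slots beyond an expert's token count stay zero because the kernel
+# returns early for out-of-range entries.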
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/benchmark_util.py b/build/torch211-cxx11-xpu20253-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
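+
+
+# Example usage (illustrative; requires a CUDA device):
+#
+#   a = torch.randn(1024, 1024, device='cuda')
+#   b = torch.randn(1024, 1024, device='cuda')
+#   mean_ms, std_ms = benchmark_function(lambda: torch.mm(a, b))
+#   log_benchmark('MatMul', {'m': 1024, 'n': 1024, 'k': 1024}, mean_ms, std_ms)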
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/cpu_fused_moe.py b/build/torch211-cxx11-xpu20253-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
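+
+
+# Example usage (illustrative, assumed sizes):
+#
+#   x = torch.randn(4, 16)         # 4 tokens, hidden_size=16
+#   router_w = torch.randn(8, 16)  # 8 experts
+#   logits, weights, ids = route_tokens_cpu(x, router_w, None, moe_top_k=2, moe_num_experts=8)
+#   # logits: [4, 8]; weights and ids: [4, 2], with weights softmax-normalized over the top-k.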
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+ This implementation loops over experts, but all tokens routed to a given
+ expert are processed with batched tensor operations, which keeps the
+ Python overhead small on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+ # Process experts one at a time: for each expert, find the (token_idx,
+ # topk_pos) pairs routed to it, compute its MLP, and accumulate the output.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
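+
+
+# Example usage (illustrative, assumed sizes, non-interleaved weights):
+#
+#   tokens, hidden, inter, experts, topk = 4, 16, 32, 8, 2
+#   h = torch.randn(tokens, hidden)
+#   w1 = torch.randn(experts, hidden, 2 * inter)
+#   w2 = torch.randn(experts, inter, hidden)
+#   _, tw, ti = route_tokens_cpu(h, torch.randn(experts, hidden), None, topk, experts)
+#   out = cpu_fused_moe(h, w1, w2, tw, ti, is_interleaved=False)  # -> [4, 16]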
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/cpu_moe_cpp.py b/build/torch211-cxx11-xpu20253-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
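+
+
+# Illustrative call (assumes the compiled megablocks extension is importable
+# and h, w1, w2, b1, b2 are placeholder tensors with the shapes listed in the
+# docstring, i.e. h: [M, K], w1: [E, 2N, K], w2: [E, K, N]):
+#
+#   out = fused_moe_cpp(h, w1, w2, topk_weights, topk_ids,
+#                       w1_bias=b1, w2_bias=b2, alpha=1.702, limit=7.0)
+#
+# dtype conversion and DTensor unwrapping are handled inside the wrapper.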
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3, or uint8 (mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "CPUMegaBlocksMoeMLP"]
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/grouped_gemm/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/grouped_gemm/backend.py b/build/torch211-cxx11-xpu20253-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
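+
+# Shape sketch (illustrative): with trans_a=False and trans_b=False, 'a' is
+# [sum(batch_sizes), k], 'b' is [num_groups, k, n] and the result is
+# [sum(batch_sizes), n]. With trans_a=True, 'b' must be 2-D and the result
+# collects one [a.shape[1], b.shape[1]] product per group, i.e. a tensor of
+# shape [num_groups, a.shape[1], b.shape[1]].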
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/grouped_gemm/ops.py b/build/torch211-cxx11-xpu20253-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
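+
+
+# Example usage (illustrative, assumed sizes): three experts receiving 2, 3
+# and 1 tokens, hidden_size=8, ffn_hidden_size=16:
+#
+#   a = torch.randn(6, 8, device='cuda', requires_grad=True)
+#   b = torch.randn(3, 8, 16, device='cuda', requires_grad=True)
+#   batch_sizes = torch.tensor([2, 3, 1])
+#   out = gmm(a, b, batch_sizes)  # -> [6, 16]; gradients flow via GroupedGemm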
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/grouped_gemm_util.py b/build/torch211-cxx11-xpu20253-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # import grouped_gemm
+ pass
+ _grouped_gemm_is_available = True
+except ImportError as error:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+ '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/layers.py b/build/torch211-cxx11-xpu20253-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Calculate the expert sharding degree from world size and number of experts
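+# Example with hypothetical values: world_size=4 and moe_num_experts=128 give an
+# expert sharding degree of 4, i.e. 128 // 4 = 32 experts per rank.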
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
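+# Example with hypothetical values: world_size=256 and moe_num_experts=128 give
+# expert_sharding_degree=128 and hidden_sharding_degree=2, so ffn_hidden_size must
+# be divisible by 2.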
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
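+# Returns (logits, expert_weights, expert_indices); expert_weights and
+# expert_indices have shape [num_tokens, moe_top_k].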
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
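+    # gate/up channels are interleaved along the last dimension (even/odd indices);
+    # with alpha ~= 1.702, gate * sigmoid(alpha * gate) closely approximates GELU.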
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
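+# With shared_expert_weighted_sum=True and moe_top_k=4 (hypothetical values), the
+# shared expert contributes 1/5 of the output and the routed experts 4/5; otherwise
+# the two outputs are simply added.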
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+    expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
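+# Auxiliary load-balancing loss: num_experts / (tokens * top_k) times the dot
+# product of per-expert token counts and mean router scores. If the scores sum to
+# one per token, perfectly uniform routing gives a value of 1.0.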
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
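+# Sort token-to-expert assignments so tokens routed to the same expert are
+# contiguous. Returns (indices, bin_ids, bins, tokens_per_expert): `indices` is the
+# permutation that groups tokens by expert, `bin_ids` holds the expert id of each
+# sorted token, and `bins` is the inclusive cumsum of tokens_per_expert marking
+# each expert's segment boundary.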
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
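+# Example with hypothetical values: tokens=1024, top_k=4, num_experts=128,
+# world_size=1, moe_capacity_factor=1.0 -> int(1.0 * 4 * 1024 / 128) = 32 slots
+# per expert.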
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Ensure CUB knows which device to use
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
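+# Usage sketch (hypothetical sizes) for wiring up the shared expert:
+#
+#   mlp = MegaBlocksMoeMLPWithSharedExpert()
+#   up_w, down_w, up_b, down_b = create_shared_expert_weights(
+#       hidden_size=1152,
+#       shared_expert_hidden_size=3072,
+#       device=torch.device("cuda"),
+#       dtype=torch.bfloat16,
+#       init_method=torch.nn.init.kaiming_uniform_,
+#   )
+#   mlp.set_shared_expert_weights(up_w, down_w, up_b, down_b)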
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/megablocks/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/metadata.json b/build/torch211-cxx11-xpu20253-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..b911d0a2549a35a1c65ab7e77d32e5aac23cd6ac
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/metadata.json
@@ -0,0 +1,8 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "xpu"
+ }
+}
\ No newline at end of file
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2B elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/binned_gather.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/binned_scatter.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/cumsum.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/gather.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/histogram.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/histogram_benchmark.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/matmul_benchmark.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/padded_gather.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
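+
+# Usage sketch (shapes are assumptions): padded_gather reorders token rows
+# into contiguous, expert-sorted groups, padding each expert's group out to
+# the boundaries described by `padded_bins`; the backward pass reverses the
+# permutation with padded_scatter.
+#
+#   x = torch.randn(num_tokens, hidden_size, device="cuda", dtype=torch.half)
+#   out = padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)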
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/padded_scatter.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/permute_benchmark.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/repeat.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
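+
+# Example: `repeat` is a thin wrapper over torch.Tensor.repeat that skips the
+# copy when the tiling is all ones.
+#   repeat(torch.ones(2, 3), torch.Size((1, 1))).shape  # (2, 3), returns x as-is
+#   repeat(torch.ones(2, 3), torch.Size((2, 1))).shape  # (4, 3)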
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/replicate.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
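+
+# Usage sketch (shapes are assumptions): given per-bin values `x` of shape
+# (rows, num_bins) and `bins` holding the inclusive cumulative sum of bin
+# sizes, replicate expands each value across its bin, producing an output of
+# shape (rows, num_outputs); num_outputs is typically bins[-1]. The backward
+# pass sums the gradient back over each bin.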
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/round_up.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
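+
+# Example: rounding per-expert token counts up to a multiple of the block size.
+#   x = torch.tensor([3, 130, 256], dtype=torch.int32)
+#   round_up(x, 128)  # tensor([128, 256, 256], dtype=torch.int32)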
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/scatter.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/sort.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
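+
+# Usage sketch: radix-sorts integer keys and also returns the permutation that
+# produced the sorted order.
+#   top_expert = torch.randint(0, 8, (1024,), device="cuda").int()
+#   sorted_ids, indices = sort(top_expert, 3)  # 3 bits suffice for values < 8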
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/sort_benchmark.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/stk_autocast.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/sum.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
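+
+# Example: squeezes instead of reducing when the dimension is already 1, which
+# avoids launching a reduction kernel.
+#   sum(torch.ones(1, 4), dim=0).shape  # (4,)
+#   sum(torch.ones(3, 4), dim=0).shape  # (4,)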
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/ops/topology.py b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/backend/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/backend/autocast.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/backend/sputnik.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/backend/triton_kernels.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+    error_string = "incompatible dimensions: got a dimension of length {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ #Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/matrix.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers.
+# 3. Make indentation consistent.
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D data.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+            f"Got data on {data.device}, row_indices on {row_indices.device}, "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+                    f"Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
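+
+# Usage sketch (assumes b was built with a's topology, e.g.
+# Matrix(a.size(), new_data, a.row_indices, a.column_indices, a.offsets)):
+#   c = mul(a, b)  # c.data == a.data * b.data, same sparsity pattern as a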
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/linear_ops.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
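+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (illustrative only, not part of the original module):
+    # requires a CUDA device and the compiled sputnik backend, following the
+    # conventions of linear_ops_test.py.
+    import torch
+    import stk
+
+    blocking, dtype = 128, torch.float16
+    mask = stk.random.dense_mask(512, 512, 0.5, blocking)
+    a = stk.ops.to_sparse((torch.randn(512, 512) * mask).type(dtype), blocking)
+    a = a.to(torch.device("cuda"))
+    b = torch.randn(512, 256, dtype=dtype, device="cuda")
+
+    # Sparse x dense -> dense.
+    out = stk.ops.dsd(a, b)
+
+    # Dense x dense -> sparse, with the output topology taken from `topo`.
+    topo = stk.random.mask(512, 256, 0.5, blocking).to(torch.device("cuda"))
+    c = stk.ops.sdd(torch.randn(512, 512, dtype=dtype, device="cuda"), b, topo)
+    print(out.shape, stk.ops.to_dense(c).shape)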
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/matrix_ops.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# can be done much more simply than it is currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
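+
+
+if __name__ == "__main__":
+    # Minimal round-trip sketch (illustrative only, not part of the original
+    # module), following the checks in matrix_ops_test.py. Runs on CPU.
+    import torch
+    import stk
+
+    mask = stk.random.dense_mask(128, 256, 0.5, blocking=16)
+    x = (torch.randn(128, 256) * mask).type(torch.float16)
+    sparse_x = stk.ops.to_sparse(x, blocking=16)
+    dense_x = stk.ops.to_dense(sparse_x)
+    assert torch.all(torch.eq(x, dense_x))
+    print(sparse_x.nnz, torch.count_nonzero(dense_x).item())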
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/random/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/random/random_ops.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
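+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (illustrative only, not part of the original module).
+    # dense_mask() returns a dense 0/1 tensor; mask() and randn() return
+    # stk.Matrix objects built on top of it.
+    d = dense_mask(128, 256, sparsity=0.75, blocking=16)
+    print(d.shape, torch.count_nonzero(d).item())
+
+    m = mask(128, 256, sparsity=0.75, blocking=16)
+    r = randn((128, 256), sparsity=0.75, blocking=16)
+    print(m.size(), r.size())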
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/stk/random/random_ops_test.py b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from . import random_ops as random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/xpu_fused_moe.py b/build/torch211-cxx11-xpu20253-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# default
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
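+
+# Worked example (illustrative): for num_tokens=4096 and num_experts_per_node=8
+# the loop returns 256, the smallest candidate block size for which
+# ceilDiv(4096, block) * 8 <= block (ceilDiv(4096, 256) * 8 = 128 <= 256).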
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
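+
+# Example (illustrative): an 8-byte uint8 workspace slice reinterpreted as a
+# single int64 value, e.g.
+#   buf = torch.zeros(8, dtype=torch.uint8)
+#   val = _bytes_to_typed_tensor(buf, torch.int64)   # shape (1,) int64 tensor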
+
+
+def implement_zp(qweight):
+ # change u4 to s4 to avoid zero point in gemm kernel
+ # only support default zero point now
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
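+
+# Worked example (illustrative): the packed byte 0x01 holds the u4 values 0
+# (high nibble) and 1 (low nibble). After subtracting the default zero point 8
+# they become -8 and -7; re-packed as (sign << 3) | (low three bits of the
+# two's-complement byte) this yields ((1 << 3) | 0) << 4 | ((1 << 3) | 1) = 0x89.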
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+    # 4-bit weights use an [E, N, K] layout;
+    # other dtypes use [E, K, N].
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ function; temporarily exposed here before GEMM fusion.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
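+
+# Example (illustrative): route 16 tokens of width 64 across 8 experts, top-2.
+#   x = torch.randn(16, 64)
+#   w = torch.randn(8, 64)                      # router weight [num_experts, hidden]
+#   logits, weights, ids = route_tokens_xpu(x, w, None, moe_top_k=2,
+#                                           moe_num_experts=8)
+#   # logits: [16, 8]; weights: [16, 2] softmaxed top-k scores; ids: [16, 2]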
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code).
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
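+
+# Example (illustrative; requires the compiled CUDA ops on a CUDA device):
+#   x = torch.randint(0, 8, (1024,), dtype=torch.int32, device="cuda")
+#   counts = histogram(x, num_bins=8)            # tokens per bin
+#   offsets = cumsum(counts, exclusive=True)     # start offset of each bin
+#   sorted_x, order = argsort(x, end_bit=3)      # 3 bits cover values 0..7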
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/activation_fn.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/all_to_all.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
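+
+
+# Example (illustrative): exchange rows between ranks of an initialized
+# torch.distributed process group. Split sizes are per-rank row counts; with
+# async_op=True the returned handle must be waited on before reading `out`.
+#   out, handle = all_to_all(x, output_split_sizes=[2, 3],
+#                            input_split_sizes=[4, 1],
+#                            group=dist.group.WORLD, async_op=True)
+#   handle.wait()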
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/arguments.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in the shared expert (allows custom FC layers, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
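+
+
+# Example (illustrative): a small dropless-MoE configuration. Note that the
+# default `device` factory calls torch.cuda.current_device(), so either a CUDA
+# device must be visible or `device` must be overridden; mlp_impl='grouped'
+# additionally requires the grouped_gemm backend to be installed.
+#   args = Arguments(hidden_size=1024, ffn_hidden_size=4096,
+#                    moe_num_experts=8, moe_top_k=2,
+#                    bf16=True, fp16=False, mlp_impl='grouped',
+#                    device=torch.device('cuda'))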
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/common.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/dmlp_registry.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
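+
+
+# Example (illustrative):
+#   from .arguments import Arguments
+#   mlp = get(Arguments(mlp_type='glu', mlp_impl='sparse'))    # -> SparseGLU
+#   mlp = get(Arguments(mlp_type='mlp', mlp_impl='grouped'))   # -> GroupedMLP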
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/dmoe.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+        # There is a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
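+        # For example, with padded_tokens=512, ffn_hidden_size=512 and
+        # blocking=128 there are 4 block rows of 4 blocks each, giving
+        # offsets = [0, 4, 8, 12, 16].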
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
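+        # Example with blocking=128: tokens_per_expert=[3, 200, 0] rounds up to
+        # [128, 256, 0], giving padded_bins=[128, 384, 384].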
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/gelu.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
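+    # In-place derivative of the tanh-approximate GELU,
+    # 0.5 * x * (1 + tanh(0.79788456 * (x + 0.044715 * x**3))),
+    # where 0.79788456 ~= sqrt(2/pi) and 0.1070322243 ~= 3 * 0.044715 * sqrt(2/pi).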
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/glu.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+        # Activation function and GLU gating.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
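+        # Each weight is now [experts_per_rank, features_per_rank, hidden_size],
+        # the per-expert layout consumed by the grouped GEMMs below.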
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/memory_test.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
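+# Each test case is (batch_size, sequence_length, hidden_size, ffn_hidden_size,
+# num_experts, top_k), unpacked into test_memory below.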
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
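+    # Collect all live, contiguous tensors tracked by the garbage collector,
+    # de-duplicating aliases that share a data pointer.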
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/mlp.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
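+# Identity in the forward pass; the backward pass multiplies the incoming
+# gradient by `scale`. Used to rescale expert gradients under expert model
+# parallelism.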
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+    def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
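+        # x arrives as [experts, expert_capacity, hidden_size] from binned_gather;
+        # both layers run as batched matmuls over the expert dimension.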
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+            # Enable using a weighted sum for the shared expert output,
+            # weighted by the number of experts used.
+ t_experts = self.args.moe_top_k + 1
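+            # Output = shared_out / (top_k + 1) + expert_out * top_k / (top_k + 1),
+            # i.e. the shared expert counts as one more expert in the average.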
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/moe.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
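+        # Example: with moe_top_k=2, 4096 local tokens, 8 expert-parallel ranks
+        # and 64 experts, this is moe_capacity_factor * 1024 tokens per expert.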
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/mpu.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
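+    # Example: 16 expert-parallel ranks and 8 experts give esd=8 and hsd=2, so
+    # two ranks share each expert and split its ffn_hidden_size features.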
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/router.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
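+    # The z-loss per router is the mean over tokens of logsumexp(logits)^2,
+    # which penalizes large router logits (as in ST-MoE).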
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operation.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
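+# e.g. for an index tensor with 6 elements and num_experts=4 the assignment
+# above is [0, 1, 2, 3, 0, 1] (reshaped to the input's shape): tokens are
+# spread round-robin across experts regardless of the learned router scores.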
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/sharedexpert_registry.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_megablocks_885c7a2.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_megablocks_885c7a2.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..0d80074bae15f25f2ac4a90a2f5511cb5d01309c
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_megablocks_885c7a2.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ee1601097f38f9ba908bad9f2844b50f1ffdd52379ec9548c7873b34ee00271
+size 10509584
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ec290dd41dd30ed4551035db04f6c85ee1a0fe0
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_885c7a2
+ops = torch.ops._megablocks_885c7a2
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_885c7a2::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_version.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/backend/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/backend/kernels.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub triton.autotune when testing in an environment that does not have CUDA.
+# This preserves the original code while enabling testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
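+    # e.g. with 3 tokens, top_k=1, bins=[2, 3] and padded_bins=[4, 8], the
+    # output has padded_bins[-1] = 8 rows: rows 0-1 hold expert 0's tokens,
+    # row 4 holds expert 1's token, and the never-written rows stay zero.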
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per token assignment. Array 'x' has a greater or
+    # equal number of rows since it could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
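+    # Tokens beyond an expert's capacity are dropped: the (num_experts,
+    # expert_capacity) launch grid below copies at most expert_capacity rows
+    # per bin, e.g. an expert assigned 10 tokens with expert_capacity=8 only
+    # has its first 8 tokens gathered; the rest contribute zeros on scatter.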
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/benchmark_util.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
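+
+
+# Typical usage (illustrative):
+#
+#   mean_ms, std_ms = benchmark_function(lambda: layer(x))
+#   log_benchmark('MoE', {'batch_size': 8}, mean_ms, std_ms)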
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm/backend.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package because
+# grouped_gemm is vendored inside megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm/ops.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
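+
+
+# Shape sketch for gmm above: 'a' is (tokens, k), 'b' is (num_experts, k, n)
+# (or (num_experts, n, k) when trans_b=True) and 'batch_sizes' is a 1-D
+# tensor of per-expert token counts summing to 'tokens'; the result is
+# (tokens, n), i.e. one independent matmul per expert group.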
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm_util.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored inside this package, so no external import is
+    # needed here; the availability flag is simply flipped to True.
+    _grouped_gemm_is_available = True
+except ImportError:
+    warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+    msg = (
+        'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+    )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend as ops
+from .grouped_gemm import ops as backend
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/layers.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b835ac5f6929edb8b547f373212388f34be3868
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/layers.py
@@ -0,0 +1,1225 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Calculate the expert sharding degree from the world size and number of experts
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
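+    # e.g. for x of shape (seq_len, batch, hidden) the router sees
+    # seq_len * batch flattened tokens: logits is (tokens, moe_num_experts),
+    # expert_weights and expert_indices are (tokens, moe_top_k).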
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
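+    # w1 packs the gate and up projections interleaved along the last dim:
+    # even columns feed the gate, odd columns feed the up projection, e.g. a
+    # (num_experts, hidden, 2 * ffn) w1 splits into two ffn-wide halves below.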
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
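+        # e.g. with moe_top_k=3: shared_weight = 0.25 and expert_weight = 0.75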
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
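+    # e.g. tokens=1024, top_k=4, num_experts=128, world_size=1 and
+    # moe_capacity_factor=1.0 give int(4 * 1024 / 128) = 32 slots per expert.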
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Ensure CUB knows which device to use
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
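+
+
+# Illustrative usage sketch (assumptions only, not part of the kernel API):
+# the shapes, dtype and init functions below are made up for the example. It
+# shows how the shared-expert variant is meant to be wired together with
+# create_shared_expert_weights() and set_shared_expert_weights().
+#
+#   mlp = MegaBlocksMoeMLPWithSharedExpert()
+#   # router and experts configured as for MegaBlocksMoeMLP
+#   up_w, down_w, up_b, down_b = create_shared_expert_weights(
+#       hidden_size=1024,
+#       shared_expert_hidden_size=4096,
+#       device=torch.device("cuda"),
+#       dtype=torch.bfloat16,
+#       init_method=torch.nn.init.xavier_uniform_,
+#   )
+#   mlp.set_shared_expert_weights(
+#       up_w, down_w, up_b, down_b,
+#       weighted_sum=True,
+#       activation_fn=torch.nn.functional.gelu,
+#   )
+#   out, expert_weights = mlp(x)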
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/all_to_all_benchmark.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2-byte (fp16) elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/binned_gather.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
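+
+
+# Note (descriptive, based on how the op is used elsewhere in this package):
+# binned_gather(x, indices, bins, bin_size, top_k) copies the tokens assigned
+# to each expert into a dense (num_experts, bin_size, hidden_size) tensor,
+# where bin_size is the fixed per-expert capacity. binned_scatter is the
+# inverse permutation, which is why it appears in the backward pass above.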
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/binned_scatter.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/cumsum.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
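+
+
+# Illustrative example (values assumed): with tokens_per_expert = [3, 1, 2]
+# (torch.int32, on CUDA),
+#   exclusive_cumsum(tokens_per_expert, 0) -> [0, 3, 4]
+#   inclusive_cumsum(tokens_per_expert, 0) -> [3, 4, 6]
+# The inclusive form is what the routing code uses to build the per-expert
+# "bins" (end offsets of each expert's token range).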
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/gather.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/histogram.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
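+
+
+# Illustrative example (values assumed): for expert assignments
+# top_expert = [2, 0, 1, 0] and max_val = 4 experts,
+#   histogram(top_expert, 4) -> [2, 1, 1, 0]
+# i.e. the number of tokens routed to each expert.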
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/histogram_benchmark.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57b7bf8228e01237236748147368b09ffdf8072
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class HistogramBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testTorchHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/matmul_benchmark.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccc5dcec5e9a663794fad944c45285869c4d1c1
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# stk is vendored under megablocks.stk, so use the local copy rather than the
+# external `stanford-stk` package.
+from .. import stk
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+class MatmulBenchmark(parameterized.TestCase):
+
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+ blocking = 128
+ padded_tokens, _ = x.size()
+ assert padded_tokens % blocking == 0
+ assert fhs % blocking == 0
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // blocking
+ blocks_per_row = fhs // blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ blocking,
+ block_rows,
+ blocks_per_row,
+ )
+ data = torch.empty(
+ column_indices.numel(),
+ blocking,
+ blocking,
+ dtype=torch.float16,
+ device=x.device,
+ )
+ shape = (padded_tokens, fhs * ne)
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+
+ def build_input_matrix(self, sl, hs, ne):
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Assign tokens to experts uniformly.
+ top_expert = torch.arange(0, sl).cuda().int() % ne
+
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+ return out, padded_bins
+
+ def build_weight_matrix(self, ne, hs, fhs):
+ return torch.randn((hs, ne * fhs)).cuda().half()
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(x, w, topo)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(topo, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradX::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ topo = topo.t()
+
+ def benchmark():
+ return stk.ops.dsd(topo, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(out, w, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ x = x.t()
+
+ def benchmark():
+ return stk.ops.dsd(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+
+        # Materialize the transposed layout, then take a transposed view so the
+        # batched matmul sees a non-contiguous NT operand.
+        w = w.transpose(1, 2).contiguous()
+        w = w.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd:DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = w.transpose(1, 2).contiguous()
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradX:DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ out = out.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(out, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradW:DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = torch.transpose(w, 1, 2)
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ x = torch.transpose(x, 1, 2)
+
+ def benchmark():
+ return torch.bmm(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/padded_gather.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/padded_scatter.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/padded_scatter_benchmark.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c575cfe7487d346ba9ec18bbb7ef17f2eb77ec51
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+class PaddedScatterTest(parameterized.TestCase):
+
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+ def testPaddedScatter(self, sl, hs, ne, top_k):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ # Sample weights for the scatter reduce.
+ weights = torch.rand((sl * top_k,)).cuda().half()
+
+ # Gather the data to prepare for backwards.
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ def benchmark():
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+ benchmark_util.log_benchmark(
+ 'Padded Scatter',
+ {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ 'top_k': top_k,
+ },
+ time,
+ std,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/permute_benchmark.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6536eeeae402659a087e5c51ef9840627af56501
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+class PermuteBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedGather(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+        def benchmark():
+            return ops.binned_gather(x, indices, bins, ec, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedScatter(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.binned_gather(x, indices, bins, ec, 1)
+
+        def benchmark():
+            return ops.binned_scatter(x, indices, None, bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedGather(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedScatter(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+        def benchmark():
+            return ops.padded_scatter(x, indices, bin_ids, None, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testCopy(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ # ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ y = x.clone()
+
+ def benchmark():
+ return y.copy_(x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/repeat.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/replicate.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/round_up.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+    # do this in a custom kernel. We only expect
+    # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
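+
+
+# Illustrative example: round_up(torch.tensor([5, 128, 130], dtype=torch.int32), 128)
+# returns [128, 128, 256], i.e. each entry rounded up to the next multiple of `value`.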
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/scatter.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/sort.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
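+
+
+# Illustrative example (values assumed): for expert assignments
+# x = torch.tensor([2, 0, 1, 0], dtype=torch.int32, device="cuda"),
+#   sort(x) -> ([0, 0, 1, 2], [1, 3, 2, 0])
+# The first output holds the expert ids in sorted order and the second holds
+# the original token positions, which is how the routing code obtains
+# `bin_ids, indices = ops.sort(top_expert)`.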
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/sort_benchmark.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff957d4c552c6e61d9279a7989795472af7b7
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class SortBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_SORT_TESTS)
+ def testSort(self, n, dtype, max_val):
+ if max_val is None:
+ max_val = np.iinfo(numpy_dtype(dtype)).max
+ end_bit = int(np.ceil(np.log2(max_val)))
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_BASELINE_SORT_TESTS)
+ def testTorchSort(self, n):
+ x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+ arguments = {
+ 'n': n,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/stk_autocast.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/sum.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/topology.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/backend/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/backend/autocast.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/backend/sputnik.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/backend/triton_kernels.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
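+# Sparse = dense @ dense: one program per nonzero output block. The block's
+# (row, col) coordinates come from row_indices/column_indices, and the dense
+# operands are reduced along K to fill that block of the sparse output.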
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to sparse matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
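+# Dense = sparse @ dense: each program owns one BLOCK_M x BLOCK_N output tile
+# and walks the nonzero blocks of its block row (offsets[pid_m] to
+# offsets[pid_m + 1]), gathering the matching rows of the dense operand via
+# column_indices.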
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
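+# Dense = dense @ sparse: mirrors _dsd_kernel with the operands swapped. Each
+# program walks the nonzero blocks of its output block column using the
+# (transposed) offsets and column indices of the sparse operand.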
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
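+# Expand block-row offsets into one row id per nonzero block: program `pid`
+# writes its row index into `out` for every block in [offsets[pid],
+# offsets[pid + 1]).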
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/matrix.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
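+# Build the metadata needed to traverse the matrix by block column: the
+# transposed column indices and offsets, plus a permutation mapping the
+# transposed block order back into the storage order of `data`.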
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
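+
+    # Storage sketch (BCSR): `data` holds the nonzero blocks as
+    # [nnz_blocks, block, block]; `row_indices` and `column_indices` give the
+    # block coordinates of each nonzero block; `offsets` marks where each block
+    # row starts in that list. The *_t fields cache the same metadata for the
+    # transpose.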
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+                    f"Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/eltwise_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
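+
+# Usage sketch (illustrative; `a` and `b` are hypothetical Matrix objects built
+# over the same topology, e.g. b = ones_like(a)): mul(a, b) reuses a's metadata
+# and multiplies only the stored blocks.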
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/eltwise_ops_test.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bfd4f6af77042d3c5bdb1fe18d00e457478d46
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+@parameterized.parameters(_ELTWISE_OP_TESTS)
+class EltwiseOpsTest(parameterized.TestCase):
+
+ def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+
+ a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+ b_dense, b = _dense_and_sparse_like(a)
+
+ out = stk.ops.mul(a, b)
+ expected_out = torch.mul(a_dense, b_dense)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size(), out.size())
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = a_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = b_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/linear_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
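+# Naming convention: each letter gives the format of (output, lhs, rhs), where
+# "d" is dense and "s" is block-sparse; e.g. dsd computes a dense output from a
+# sparse lhs and a dense rhs.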
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/linear_ops_test.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced1d782fbc9f9ca16b3449239f1588dc5ff5e00
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+@parameterized.parameters(*_LINEAR_OP_TESTS)
+class LinearOpsTest(parameterized.TestCase):
+
+ def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = _mask(a_dense.grad, a.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = _mask(b_dense.grad, b.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+ _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+ expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/matrix_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
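+    # Expand per-block (row, col) index pairs into element-level (row, col)
+    # pairs covering every entry of each blocking x blocking tile.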
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/matrix_ops_test.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3af04c0760483e578f93303dc457415948a2a34c
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+from absl.testing import parameterized
+import stk
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class MatrixOpsTest(parameterized.TestCase):
+
+ def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ x = (torch.randn(rows, cols) * mask).type(torch.float16)
+
+ # Convert the matrix to sparse format.
+ sparse_x = stk.ops.to_sparse(x, blocking)
+
+ # Validate the matrix.
+ sparse_x.validate()
+
+ # Validate the shape.
+ self.assertEqual(sparse_x.dim(), 2)
+ self.assertEqual(sparse_x.size()[0], rows)
+ self.assertEqual(sparse_x.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(sparse_x.nnz, nnz)
+
+ # Convert back to dense format.
+ dense_x = stk.ops.to_dense(sparse_x)
+
+ # Validate the shape.
+ self.assertEqual(dense_x.dim(), 2)
+ self.assertEqual(dense_x.size()[0], rows)
+ self.assertEqual(dense_x.size()[1], cols)
+
+ # Validate the sparsity
+ self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+
+ # Validate the output.
+ self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/random/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/random/random_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
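+# Sample a {0, 1} mask of shape (rows, cols) in which roughly `sparsity` of the
+# blocking x blocking tiles are zeroed out.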
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/random/random_ops_test.py b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..587b44ec890c861879c6296b8f9028f5d99ab82f
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/megablocks/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+from absl.testing import parameterized
+from . import random_ops as random
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class RandomOpsTest(parameterized.TestCase):
+
+ def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+ mask = random.dense_mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(
+ torch.count_nonzero(mask).item(),
+ nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask, 0),
+ torch.eq(mask, 1))))
+
+ def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+ mask = random.mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the matrix.
+ mask.validate()
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(mask.nnz, nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask.data, 0),
+ torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code).
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
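+
+
+# Illustrative sketch (not part of the original exports; `expert_ids` and
+# `num_experts` are hypothetical names, and the underlying ops are assumed to
+# expect CUDA tensors):
+#   sorted_ids, gather_idx = argsort(expert_ids)
+#   counts = histogram(expert_ids, num_bins=num_experts)
+#   bins = cumsum(counts, exclusive=False)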
+
+
+# Export public API
+__all__ = [
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/activation_fn.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/all_to_all.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
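+# Autograd-aware wrapper around dist.all_to_all_single: forward exchanges
+# variable-sized splits across the process group, and backward runs the same
+# exchange with the split sizes swapped, so gradients are routed back to the
+# ranks that produced the corresponding inputs.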
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/arguments.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in shared expert (purpose: to allow using a custom FC layer, e.g. te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by the number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
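+
+
+# Illustrative construction (added note; values are examples only, not
+# recommended defaults). Note that the default `device` factory calls
+# torch.cuda.current_device(), so constructing Arguments requires CUDA unless
+# an explicit device is passed.
+#
+#     args = Arguments(
+#         hidden_size=1024,
+#         ffn_hidden_size=4096,
+#         moe_num_experts=8,
+#         moe_top_k=2,
+#         mlp_impl='grouped',  # 'sparse' requires triton < 3.2.0
+#         bf16=True,
+#         fp16=False,
+#     )
+#
+# __post_init__ validates the mlp_impl choice and defaults
+# shared_expert_hidden_size to ffn_hidden_size when it is left unset.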
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/common.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/dmlp_registry.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
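+
+
+# Illustrative lookup (added note): with mlp_type='glu' and mlp_impl='grouped'
+# this returns glu.GroupedGLU(args); unsupported combinations raise ValueError
+# before any expert weights are allocated.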
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/dmoe.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
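+        # Worked example (added note, illustrative only). With blocking=128 and
+        # a dense 256x256 block layout, blocks_per_row = 2 and the block ids
+        # are stored row-major:
+        #
+        #     row_indices    = [0, 0, 1, 1]
+        #     column_indices = [0, 1, 0, 1]
+        #
+        # Sorting by column index gives gather_indices = [0, 2, 1, 3], so
+        # column_indices_t = row_indices[gather_indices] = [0, 1, 0, 1],
+        # block_offsets_t = [0, 2, 1, 3], and offsets_t = [0, 2, 4] (two
+        # nonzero blocks per transposed column).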
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
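+        # Illustrative example (added note): with 4 experts, blocking=128 and
+        # tokens_per_expert = [5, 130, 0, 1], the padded counts are
+        # [128, 256, 0, 128], so padded_bins = [128, 384, 384, 512] while
+        # bins = [5, 135, 135, 136]. Each expert's rows start at the previous
+        # padded bin boundary and are rounded up to a multiple of the block
+        # size.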
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/gelu.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
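+
+
+# Added note: _gelu_backward_inplace implements the derivative of the
+# tanh-approximated GELU, gelu(x) = 0.5 * x * (1 + tanh(0.79788456 * (x +
+# 0.044715 * x**3))), multiplied into the incoming gradient in place. A quick
+# autograd cross-check (illustrative sketch only, on dense tensors):
+#
+#     x = torch.randn(8, requires_grad=True)
+#     F.gelu(x, approximate='tanh').sum().backward()
+#     manual = _gelu_backward_inplace(torch.ones(8), x.detach())
+#     torch.testing.assert_close(manual, x.grad, rtol=1e-4, atol=1e-4)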
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/glu.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+    """GLU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/memory_test.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/mlp.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+    def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+            # weighted by the number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
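+        # Illustrative example (added note): with moe_top_k = 3 the weighted
+        # path above uses t_experts = 4, i.e.
+        #     out = shared_expert_out * (1 / 4) + expert_out * (3 / 4),
+        # so the shared expert counts as one extra "expert" in the average.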
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/moe.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
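+    # Illustrative scale (added note, example numbers only): with
+    # moe_loss_weight=0.1, moe_num_experts=8, num_layers=2, tokens=1024 and
+    # moe_top_k=2, the scale is (8 * 0.1) / (2 * 1024 * 2) = 0.8 / 4096; the
+    # returned loss is that scale times the dot product of the concatenated
+    # per-layer token counts and mean expert scores.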
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
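+        # Illustrative example (added note): with moe_top_k=2, 4096 tokens on
+        # this rank, an expert parallel world size of 8, 64 experts and
+        # moe_capacity_factor=1, the capacity is int(1 * 2 * 4096 * 8 / 64)
+        # = 1024 tokens per expert.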
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/mpu.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/router.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
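+
+
+# A minimal, illustrative sketch (not part of the vendored API): the helper
+# name, the SimpleNamespace stand-in for Arguments, and the sizes below are
+# hypothetical. It shows the intended flow: logits are saved during the
+# router forward pass and later reduced into one z-loss value per router.
+def _example_batched_router_zloss():
+    from types import SimpleNamespace
+    args = SimpleNamespace(moe_zloss_weight=1e-3, moe_zloss_in_fp32=True)
+    logits = torch.randn(8, 4)  # [tokens, num_experts]
+    _save_router_logits(logits, args)
+    zloss = batched_router_zloss(args)  # one scaled value per saved router
+    clear_router_zloss()
+    return zloss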
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/sharedexpert_registry.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_megablocks_885c7a2.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_megablocks_885c7a2.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..78f4bd294d6978983cb2f4940fe24d66fc47c5f8
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_megablocks_885c7a2.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac18b8df258ecd2e14581e6ec042b55d296f63ec1ab9d1704d5332a0d9cef05a
+size 11918752
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ec290dd41dd30ed4551035db04f6c85ee1a0fe0
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_885c7a2
+ops = torch.ops._megablocks_885c7a2
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_885c7a2::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_version.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/backend/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/backend/kernels.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have CUDA.
+# This preserves the original code while enabling testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+ # number of rows since they could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
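+
+
+# A minimal round-trip sketch (hypothetical sizes; assumes a CUDA device with
+# Triton available). The routing metadata is rebuilt here with plain torch
+# calls purely for illustration; in the library it comes from the sort /
+# histogram / inclusive_cumsum ops.
+def _example_gather_scatter_roundtrip(tokens=8, hidden_size=16, num_experts=4, top_k=2):
+    x = torch.randn(tokens, hidden_size, device='cuda')
+    top_experts = torch.randint(0, num_experts, (tokens * top_k,), device='cuda')
+    # Sort assignments by expert; 'indices' maps sorted slots back to tokens.
+    bin_ids, indices = torch.sort(top_experts)
+    bin_ids, indices = bin_ids.int(), indices.int()
+    tokens_per_expert = torch.bincount(top_experts, minlength=num_experts)
+    bins = torch.cumsum(tokens_per_expert, dim=0).int()
+    weights = torch.rand(tokens * top_k, device='cuda')
+    y = gather(x, indices, bin_ids, None, bins, top_k)        # [tokens * top_k, hidden_size]
+    out = scatter(y, indices, bin_ids, weights, bins, top_k)  # [tokens, hidden_size]
+    return out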
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per output entry in 'wgrad'. Array 'x' has at least
+    # as many rows as 'grad' since its rows may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/benchmark_util.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
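+
+
+# A minimal usage sketch (assumes a CUDA device; the matmul and its sizes are
+# purely illustrative).
+if __name__ == '__main__':
+    a = torch.randn(1024, 1024, device='cuda')
+    b = torch.randn(1024, 1024, device='cuda')
+    mean_ms, std_ms = benchmark_function(lambda: torch.matmul(a, b))
+    log_benchmark('MatMul', {'m': 1024, 'n': 1024, 'k': 1024}, mean_ms, std_ms)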
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/grouped_gemm/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/grouped_gemm/backend.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/grouped_gemm/ops.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
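+
+
+# A minimal usage sketch (hypothetical sizes; assumes a CUDA device, the
+# compiled megablocks extension providing the grouped GEMM backend, and that
+# the backend accepts bfloat16 inputs with CPU int64 batch_sizes).
+if __name__ == '__main__':
+    num_experts, k, n = 4, 64, 128
+    batch_sizes = torch.tensor([3, 5, 2, 6])  # per-expert token counts, on CPU
+    a = torch.randn(int(batch_sizes.sum()), k, device='cuda', dtype=torch.bfloat16)
+    b = torch.randn(num_experts, k, n, device='cuda', dtype=torch.bfloat16)
+    c = gmm(a, b, batch_sizes)  # [sum(batch_sizes), n]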
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/grouped_gemm_util.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # import grouped_gemm
+    _grouped_gemm_is_available = True
+except ImportError as error:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/layers.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b835ac5f6929edb8b547f373212388f34be3868
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/layers.py
@@ -0,0 +1,1225 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+                        (bins.shape[0], bin_size, hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+                        (indices.shape[0] // top_k, x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Calculate the expert sharding degree from the world size and number of experts
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
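+
+
+# A small worked example (hypothetical numbers, not tied to any particular
+# deployment): with world_size=8, 4 experts and ffn_hidden_size=1024, the
+# experts are sharded 4 ways and the FFN features 2 ways, so each rank owns
+# 1 expert and 512 features.
+def _example_sharding_degrees():
+    world_size, num_experts, ffn = 8, 4, 1024
+    assert expert_sharding_degree(world_size, num_experts) == 4
+    assert hidden_sharding_degree(world_size, num_experts, ffn) == 2
+    assert experts_per_rank(num_experts, world_size) == 1
+    assert features_per_rank(ffn, world_size, num_experts) == 512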
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
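+
+
+# A minimal routing sketch (hypothetical sizes; runs on CPU). The random
+# router weight is illustrative only.
+def _example_route_tokens():
+    hidden_size, num_experts, top_k = 32, 8, 2
+    x = torch.randn(4, 6, hidden_size)  # [sl, bs, hs]
+    router_weight = torch.randn(num_experts, hidden_size)
+    logits, expert_weights, expert_indices = route_tokens(
+        x, router_weight, None, top_k, num_experts
+    )
+    # logits: [sl * bs, num_experts]; expert_weights / expert_indices: [sl * bs, top_k]
+    return logits, expert_weights, expert_indices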
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
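+
+
+# A minimal shape sketch (hypothetical sizes; assumes plain, non-DTensor
+# parameters). Gate and up projections are interleaved along the last
+# dimension of w1, so it is twice the intermediate size.
+def _example_mlp_forward():
+    num_experts, capacity, hidden, intermediate = 4, 8, 32, 64
+    x = torch.randn(num_experts, capacity, hidden)
+    w1 = torch.randn(num_experts, hidden, 2 * intermediate) * 0.02
+    w1_bias = torch.zeros(num_experts, 2 * intermediate)
+    w2 = torch.randn(num_experts, intermediate, hidden) * 0.02
+    w2_bias = torch.zeros(num_experts, hidden)
+    return mlp_forward(x, w1, w2, w1_bias, w2_bias)  # [num_experts, capacity, hidden]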
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
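+
+
+# A small worked example (hypothetical numbers): 1024 tokens, top_k=4,
+# 128 experts, capacity factor 1.0 and no expert parallelism give
+# int(4 * 1024 / 128) = 32 slots per expert.
+def _example_expert_capacity():
+    return expert_capacity(
+        tokens=1024,
+        top_k=4,
+        num_experts=128,
+        expert_parallel_group=None,
+        moe_capacity_factor=1.0,
+        moe_expert_model_parallelism=False,
+    )  # -> 32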
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
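+
+
+# A small worked example (hypothetical numbers): 4 tokens routed with top_k=2
+# over 4 experts. With perfectly uniform routing and uniform router scores the
+# loss reduces to 1.0.
+def _example_load_balancing_loss():
+    num_experts, top_k, tokens = 4, 2, 4
+    tokens_per_expert = torch.tensor([2, 2, 2, 2])           # tokens * top_k assignments, spread evenly
+    expert_scores = torch.full((tokens, num_experts), 0.25)  # uniform router probabilities
+    return load_balancing_loss(tokens_per_expert, expert_scores, top_k, num_experts)  # -> 1.0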
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+        # Kick off the count exchange asynchronously so it overlaps with the
+        # local permutation below.
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
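+        # remainder() converts each flat bin index into a local expert id and
+        # replicate() expands it to one entry per received token.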
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+    uniform_expert_assignment: bool = False,
+    training: bool = False,
+    w1: Optional[torch.Tensor] = None,
+    w2: Optional[torch.Tensor] = None,
+    w1_bias: Optional[torch.Tensor] = None,
+    w2_bias: Optional[torch.Tensor] = None,
+    gradient_scale: Optional[float] = None,
+    alpha: float = 1.702,
+    sort_end_bit: int = 0,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+    moe_capacity_factor: float = 1.0,
+    moe_expert_model_parallelism: bool = False,
+    forward_fn: Any = None,
+    hidden_size: Optional[int] = None,
+    mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
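+    """Route tokens, dispatch them through ``forward_fn`` (typically
+    ``forward_once`` or ``parallel_forward_once``), and return
+    ``(output, expert_weights, router_scores)``."""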
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
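+    # Dense [num_experts, num_tokens] scores: the top-k routing weights scattered
+    # back into their expert positions.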
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+    uniform_expert_assignment: bool = False,
+    training: bool = False,
+    w1: Optional[torch.Tensor] = None,
+    w2: Optional[torch.Tensor] = None,
+    w1_bias: Optional[torch.Tensor] = None,
+    w2_bias: Optional[torch.Tensor] = None,
+    gradient_scale: Optional[float] = None,
+    alpha: float = 1.702,
+    sort_end_bit: int = 0,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+    moe_capacity_factor: float = 1.0,
+    moe_expert_model_parallelism: bool = False,
+    forward_fn: Any = None,
+    hidden_size: Optional[int] = None,
+    mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
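+    """Run the standard MoE forward pass and, when shared-expert weights are
+    provided, combine the shared expert's output with the routed-expert output."""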
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
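+    """Allocate and initialize the shared expert's up/down projection weights.
+
+    Returns ``(up_proj_weight, down_proj_weight, up_proj_bias, down_proj_bias)``;
+    the biases are ``None`` by default.
+    """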
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
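+    """Mixture-of-experts MLP backed by the megablocks kernels.
+
+    Expects ``self.router`` (a linear layer with ``weight``/``bias`` and an
+    optional ``top_k`` attribute) and ``self.experts`` carrying ``gate_up_proj``,
+    ``gate_up_proj_bias``, ``down_proj``, ``down_proj_bias``, ``hidden_size`` and
+    optional tuning attributes such as ``num_experts`` and ``capacity_factor``.
+    ``forward`` returns ``(output, expert_weights)``.
+    """
+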
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
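+    """MegaBlocksMoeMLP with an optional shared expert applied to every token.
+
+    Shared-expert weights are attached via ``set_shared_expert_weights``; when
+    present, their output is combined with the routed-expert output.
+    """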
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/all_to_all_benchmark.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per fp16 element.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/binned_gather.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/binned_scatter.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/cumsum.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
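+        # The kernel expects a 2-D input: lift 1-D tensors to a single row and
+        # squeeze the result back afterwards.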
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/gather.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/histogram.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/histogram_benchmark.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57b7bf8228e01237236748147368b09ffdf8072
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class HistogramBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testTorchHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/matmul_benchmark.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccc5dcec5e9a663794fad944c45285869c4d1c1
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+class MatmulBenchmark(parameterized.TestCase):
+
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+ blocking = 128
+ padded_tokens, _ = x.size()
+ assert padded_tokens % blocking == 0
+ assert fhs % blocking == 0
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // blocking
+ blocks_per_row = fhs // blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ blocking,
+ block_rows,
+ blocks_per_row,
+ )
+ data = torch.empty(
+ column_indices.numel(),
+ blocking,
+ blocking,
+ dtype=torch.float16,
+ device=x.device,
+ )
+ shape = (padded_tokens, fhs * ne)
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+
+ def build_input_matrix(self, sl, hs, ne):
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Assign tokens to experts uniformly.
+ top_expert = torch.arange(0, sl).cuda().int() % ne
+
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+ return out, padded_bins
+
+ def build_weight_matrix(self, ne, hs, fhs):
+ return torch.randn((hs, ne * fhs)).cuda().half()
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(x, w, topo)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(topo, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradX::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ topo = topo.t()
+
+ def benchmark():
+ return stk.ops.dsd(topo, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(out, w, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ x = x.t()
+
+ def benchmark():
+ return stk.ops.dsd(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+
+ w = w.transpose(1, 2).contiguous()
+ w = w.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd:DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = w.transpose(1, 2).contiguous()
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradX:DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ out = out.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(out, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradW:DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = torch.transpose(w, 1, 2)
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ x = torch.transpose(x, 1, 2)
+
+ def benchmark():
+ return torch.bmm(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/padded_gather.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/padded_scatter.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/padded_scatter_benchmark.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c575cfe7487d346ba9ec18bbb7ef17f2eb77ec51
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+class PaddedScatterTest(parameterized.TestCase):
+
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+ def testPaddedScatter(self, sl, hs, ne, top_k):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ # Sample weights for the scatter reduce.
+ weights = torch.rand((sl * top_k,)).cuda().half()
+
+ # Gather the data to prepare for backwards.
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ def benchmark():
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+ benchmark_util.log_benchmark(
+ 'Padded Scatter',
+ {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ 'top_k': top_k,
+ },
+ time,
+ std,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/permute_benchmark.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6536eeeae402659a087e5c51ef9840627af56501
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+class PermuteBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedGather(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+            return ops.binned_gather(x, indices, bins, ec, 1)  # top_k=1: one expert per token above.
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedScatter(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ x = ops.binned_gather(x, indices, bins, ec)
+
+ def benchmark():
+            return ops.binned_scatter(x, indices, None, bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedGather(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedScatter(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ def benchmark():
+            return ops.padded_scatter(x, indices, bin_ids, None, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testCopy(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ # ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ y = x.clone()
+
+ def benchmark():
+ return y.copy_(x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/repeat.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
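+    # Fast path: a tiling of all ones is a no-op, so skip the copy x.repeat() would make.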
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/replicate.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/round_up.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+    # do this in a custom kernel. We only expect
+    # to use this on arrays of less than 1k elements.
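+    # Example: round_up(torch.tensor([3, 130], dtype=torch.int32), 128) -> [128, 256].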
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/scatter.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/sort.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/sort_benchmark.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff957d4c552c6e61d9279a7989795472af7b7
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class SortBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_SORT_TESTS)
+ def testSort(self, n, dtype, max_val):
+ if max_val is None:
+ max_val = np.iinfo(numpy_dtype(dtype)).max
+ end_bit = int(np.ceil(np.log2(max_val)))
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_BASELINE_SORT_TESTS)
+ def testTorchSort(self, n):
+ x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+ arguments = {
+ 'n': n,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/stk_autocast.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/sum.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
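+# Fast path: when the reduced dimension has length 1, squeezing it yields the same
+# tensor as summing, without launching a reduction kernel.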
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/topology.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/backend/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/backend/autocast.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/backend/sputnik.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
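+# A dense 2D tensor counts as "transposed" when its layout is column-major
+# (stride 1 along dim 0), i.e. it is a .t() view of a contiguous tensor.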
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
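+# Gradient helpers: for y = lhs @ rhs, d(lhs) = dy @ rhs^T and d(rhs) = lhs^T @ dy.
+# The helpers below pick the operand order and transpose flags to account for
+# lhs/rhs that are themselves stored transposed.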
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/backend/triton_kernels.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
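+# All kernels below are autotuned over this single configuration, so every matmul
+# dimension must be divisible by the corresponding block size (see
+# _validate_matmul_dims).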
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ #Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/matrix.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+    block_rows = np.prod(shape[:-1]) // block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
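+# Builds the transposed matrix's metadata: sorting blocks by column index yields
+# column_indices_t (the original row indices in that order), a histogram of column
+# indices cumulatively summed gives offsets_t, and block_offsets_t records where
+# each transposed block lives in the original data tensor.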
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+                    f"Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/eltwise_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/eltwise_ops_test.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bfd4f6af77042d3c5bdb1fe18d00e457478d46
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+@parameterized.parameters(_ELTWISE_OP_TESTS)
+class EltwiseOpsTest(parameterized.TestCase):
+
+ def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+
+ a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+ b_dense, b = _dense_and_sparse_like(a)
+
+ out = stk.ops.mul(a, b)
+ expected_out = torch.mul(a_dense, b_dense)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size(), out.size())
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = a_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = b_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/linear_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
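+# Naming convention: each letter gives the layout of (output, lhs, rhs).
+# dsd: dense = sparse @ dense; dds: dense = dense @ sparse;
+# sdd: sparse (with topo's topology) = dense @ dense.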
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/linear_ops_test.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced1d782fbc9f9ca16b3449239f1588dc5ff5e00
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+@parameterized.parameters(*_LINEAR_OP_TESTS)
+class LinearOpsTest(parameterized.TestCase):
+
+ def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = _mask(a_dense.grad, a.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = _mask(b_dense.grad, b.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+ _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+ expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/matrix_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
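+# Expands block-level (row, col) coordinates into the element-level coordinates of
+# every entry inside each `blocking` x `blocking` block.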
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/matrix_ops_test.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3af04c0760483e578f93303dc457415948a2a34c
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+from absl.testing import parameterized
+import stk
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class MatrixOpsTest(parameterized.TestCase):
+
+ def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ x = (torch.randn(rows, cols) * mask).type(torch.float16)
+
+ # Convert the matrix to sparse format.
+ sparse_x = stk.ops.to_sparse(x, blocking)
+
+ # Validate the matrix.
+ sparse_x.validate()
+
+ # Validate the shape.
+ self.assertEqual(sparse_x.dim(), 2)
+ self.assertEqual(sparse_x.size()[0], rows)
+ self.assertEqual(sparse_x.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(sparse_x.nnz, nnz)
+
+ # Convert back to dense format.
+ dense_x = stk.ops.to_dense(sparse_x)
+
+ # Validate the shape.
+ self.assertEqual(dense_x.dim(), 2)
+ self.assertEqual(dense_x.size()[0], rows)
+ self.assertEqual(dense_x.size()[1], cols)
+
+ # Validate the sparsity
+ self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+
+ # Validate the output.
+ self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/random/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/random/random_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
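+# Builds a dense {0, 1} mask at block granularity: round(block_rows * block_cols *
+# (1 - sparsity)) blocks are kept, the rest are zeroed, and the block mask is then
+# tiled up to element resolution as a float32 tensor.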
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/random/random_ops_test.py b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..587b44ec890c861879c6296b8f9028f5d99ab82f
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/megablocks/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+from absl.testing import parameterized
+from . import random_ops as random
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class RandomOpsTest(parameterized.TestCase):
+
+ def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+ mask = random.dense_mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(
+ torch.count_nonzero(mask).item(),
+ nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask, 0),
+ torch.eq(mask, 1))))
+
+ def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+ mask = random.mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the matrix.
+ mask.validate()
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(mask.nnz, nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask.data, 0),
+ torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code).
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
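+
+# A minimal usage sketch for the convenience wrapper above, kept as a comment so
+# nothing executes at import time. It assumes a CUDA device and a 1-D integer
+# input, which is the shape/dtype the routing code feeds these kernels:
+#
+#   counts = torch.randint(0, 4, (8,), dtype=torch.int32, device="cuda")
+#   inc = cumsum(counts, dim=0)                   # inclusive running totals
+#   exc = cumsum(counts, dim=0, exclusive=True)   # totals shifted by one position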
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
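+
+# Hedged example of the wrapper above (comment only). Note that `iota_out` is
+# allocated with the same dtype as `x`, so the returned indices share the input's
+# dtype; the expert ids sorted in this codebase are small integers, so that is fine:
+#
+#   ids = torch.tensor([3, 1, 2, 1], dtype=torch.int32, device="cuda")
+#   sorted_ids, order = argsort(ids, end_bit=2)   # 2 bits are enough for ids 0..3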
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/activation_fn.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/all_to_all.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
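+
+
+# Rough usage sketch (comment only). With an initialized process group, each rank
+# sends `input_split_sizes[i]` rows of `x` to rank i and receives
+# `output_split_sizes[i]` rows from it; the handle is only meaningful when
+# async_op=True. The group below is an assumption, not something this module sets up:
+#
+#   out, handle = all_to_all(x, output_split_sizes, input_split_sizes,
+#                            group=dist.group.WORLD, async_op=True)
+#   handle.wait()  # overlap independent work before waiting, if desired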
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/arguments.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in shared expert (purpose: to allow using custom FC layer eg te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
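+
+
+# Illustrative construction (comment only); the values are arbitrary and 'grouped'
+# is chosen because the sparse path requires triton < 3.2.0 (see __post_init__):
+#
+#   args = Arguments(
+#       hidden_size=1024,
+#       ffn_hidden_size=4096,
+#       moe_num_experts=8,
+#       moe_top_k=2,
+#       mlp_impl='grouped',
+#       fp16=False,
+#       bf16=True,
+#   )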
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/common.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/dmlp_registry.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
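+
+
+# For example, Arguments(mlp_type='glu', mlp_impl='grouped') resolves to
+# glu.GroupedGLU, while the defaults ('mlp', 'sparse') resolve to mlp.SparseMLP.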
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/dmoe.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
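+
+        # Worked example (illustrative only): with blocking = 128 and
+        # tokens_per_expert = [3, 1, 0, 2], the padded counts are [128, 128, 0, 128],
+        # so padded_bins = [128, 256, 256, 384] while bins = [3, 4, 4, 6].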
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/gelu.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
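+
+
+# Sanity-check sketch (comment only, assumes a CUDA tensor): the closed-form
+# derivative in _gelu_backward_inplace should agree with autograd through the
+# tanh-approximate gelu. This is an illustration, not part of the module:
+#
+#   x = torch.randn(64, device="cuda", requires_grad=True)
+#   F.gelu(x, approximate='tanh').sum().backward()
+#   manual = _gelu_backward_inplace(torch.ones(64, device="cuda"), x.detach())
+#   assert torch.allclose(x.grad, manual, atol=1e-5)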
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/glu.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+    Note: this is a copy -> paste -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/memory_test.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MiB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/mlp.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+    Note: this is a copy -> paste -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+ # wieghted by number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/moe.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f'Expected {num_layers_per_pipeline_stage} tokens_per_expert '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
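+        # e.g. with capacity_factor = 1, top_k = 2, 1024 tokens, world_size = 1 and
+        # 8 experts this sizes each expert for int(1 * 2 * 1024 * 1 / 8) = 256 tokens
+        # (illustrative numbers only).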
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
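+ # As a small illustration: with num_experts=3 and top_expert=[2, 0, 2, 1],
+ # the sort yields bin_ids=[0, 1, 2, 2] and indices=[1, 3, 0, 2], the
+ # histogram gives tokens_per_expert=[1, 1, 2], and the inclusive cumsum
+ # gives bins=[1, 2, 4], i.e. the end offset of each expert's bin.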
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+ # expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
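+ # For instance, with two expert-parallel ranks and four experts (two per
+ # rank), rank 0 ships every token routed to experts 2-3 to rank 1 and
+ # receives rank 1's tokens for experts 0-1; the send/recv counts computed
+ # below are the per-rank row counts for exactly this exchange.
+ #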
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension,
+ # multiple devices own parts of the same set of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension,
+ # multiple devices own parts of the same set of experts.
+ # Replicate the tokens so devices that share experts get all
+ # of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/mpu.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
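+
+# As a worked example: with an expert-parallel world size of 4 and
+# moe_num_experts=2, expert_sharding_degree=2 and hidden_sharding_degree=2,
+# so each rank owns a single expert, ffn_hidden_size must be divisible by 2,
+# and features_per_rank = ffn_hidden_size // 2.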
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/router.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
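+
+
+# Shape summary: for an input x of shape [sl, bs, hs] the router produces
+# scores of shape [sl * bs, moe_num_experts] and expert_weights /
+# expert_indices of shape [sl * bs, moe_top_k], which ParallelMLP flattens
+# before computing the token permutation.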
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/sharedexpert_registry.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_megablocks_885c7a2.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_megablocks_885c7a2.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..d6127bc51af7aca7db0bc5948113eebd538fc945
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_megablocks_885c7a2.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ad7baca6130dfa611dfe75ee1691d31d651cf8a49725c6a1caa5bb0ed22ba48
+size 17876184
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ec290dd41dd30ed4551035db04f6c85ee1a0fe0
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_885c7a2
+ops = torch.ops._megablocks_885c7a2
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_885c7a2::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_version.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/backend/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have CUDA.
+# This approach preserves the original code while enabling testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has a greater or equal
+ # number of rows since its rows may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
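+ # For instance, with bins=[3, 5] and padded_bins=[4, 8], expert 0's three
+ # tokens occupy rows 0-2 of the 8-row output and expert 1's two tokens
+ # start at row 4, leaving the padding rows zero-initialized.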
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has a greater or equal
+ # number of rows since its rows may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
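+ # Shape sketch: x of shape (tokens, hidden_size) is gathered into an output
+ # of shape (num_experts, expert_capacity, hidden_size); tokens beyond an
+ # expert's capacity are dropped and unused capacity slots remain zero.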
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/benchmark_util.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/grouped_gemm/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/grouped_gemm/backend.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
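+
+# Rough usage sketch (hypothetical shapes): for a of shape (total_tokens, k),
+# b of shape (num_experts, k, n) and batch_sizes summing to total_tokens,
+# gmm(a, b, batch_sizes) returns c of shape (total_tokens, n), running one
+# GEMM per expert over that expert's contiguous block of rows in 'a'.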
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/grouped_gemm/ops.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/grouped_gemm_util.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # import grouped_gemm
+ pass
+ _grouped_gemm_is_available = True
+except ImportError as error:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+ '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/layers.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b835ac5f6929edb8b547f373212388f34be3868
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/layers.py
@@ -0,0 +1,1225 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
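+ # For example, with moe_top_k=2 and shared_expert_weighted_sum=True the
+ # shared expert output is scaled by 1/3 and the routed expert output by
+ # 2/3; with the default (False) the two outputs are simply added.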
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+    expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+    tokens, score_num_experts = expert_scores.size()
+    assert score_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (count_num_experts,) = tokens_per_expert.size()
+    assert count_num_experts == num_experts
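+    # Auxiliary loss: (num_experts / (tokens * top_k)) *
+    # dot(tokens_per_expert, per-expert mean router score)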
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
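+    # e.g. tokens=4096, top_k=4, num_experts=128, world_size=1,
+    # capacity_factor=1.0 -> 4 * 4096 / 128 = 128 tokens per expert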
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
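+    # A capacity of zero (possible with few tokens and a small capacity
+    # factor) would drop every token, so fall back to the busiest expert.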
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+        # Kick off the exchange asynchronously; the handle is awaited below
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
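+    # (dense [num_experts, tokens] tensor, zero for unselected experts)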
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
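+        # Bits needed to radix-sort expert ids, e.g. 128 experts -> 7 bits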
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
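+    # Sketch (assumed usage): create weights with create_shared_expert_weights()
+    # and register them via set_shared_expert_weights() before calling forward().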
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/all_to_all_benchmark.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per element (fp16).
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/binned_gather.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/binned_scatter.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/cumsum.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/gather.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/histogram.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/histogram_benchmark.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57b7bf8228e01237236748147368b09ffdf8072
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class HistogramBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testTorchHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/matmul_benchmark.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccc5dcec5e9a663794fad944c45285869c4d1c1
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+class MatmulBenchmark(parameterized.TestCase):
+
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+ blocking = 128
+ padded_tokens, _ = x.size()
+ assert padded_tokens % blocking == 0
+ assert fhs % blocking == 0
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // blocking
+ blocks_per_row = fhs // blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ blocking,
+ block_rows,
+ blocks_per_row,
+ )
+ data = torch.empty(
+ column_indices.numel(),
+ blocking,
+ blocking,
+ dtype=torch.float16,
+ device=x.device,
+ )
+ shape = (padded_tokens, fhs * ne)
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+
+ def build_input_matrix(self, sl, hs, ne):
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Assign tokens to experts uniformly.
+ top_expert = torch.arange(0, sl).cuda().int() % ne
+
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+ return out, padded_bins
+
+ def build_weight_matrix(self, ne, hs, fhs):
+ return torch.randn((hs, ne * fhs)).cuda().half()
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(x, w, topo)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(topo, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradX::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ topo = topo.t()
+
+ def benchmark():
+ return stk.ops.dsd(topo, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(out, w, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ x = x.t()
+
+ def benchmark():
+ return stk.ops.dsd(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+
+ w = w.transpose(1, 2).contiguous()
+ w = w.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd:DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = w.transpose(1, 2).contiguous()
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradX:DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ out = out.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(out, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradW:DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = torch.transpose(w, 1, 2)
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ x = torch.transpose(x, 1, 2)
+
+ def benchmark():
+ return torch.bmm(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/padded_gather.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/padded_scatter.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/padded_scatter_benchmark.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c575cfe7487d346ba9ec18bbb7ef17f2eb77ec51
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+class PaddedScatterTest(parameterized.TestCase):
+
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+ def testPaddedScatter(self, sl, hs, ne, top_k):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ # Sample weights for the scatter reduce.
+ weights = torch.rand((sl * top_k,)).cuda().half()
+
+ # Gather the data to prepare for backwards.
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ def benchmark():
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+ benchmark_util.log_benchmark(
+ 'Padded Scatter',
+ {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ 'top_k': top_k,
+ },
+ time,
+ std,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/permute_benchmark.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6536eeeae402659a087e5c51ef9840627af56501
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+class PermuteBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedGather(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(indices, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+ return ops.binned_gather(x, indices, bins, ec)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedScatter(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(indices, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ x = ops.binned_gather(x, indices, bins, ec)
+
+ def benchmark():
+ return ops.binned_scatter(x, indices, bins)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedGather(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+ return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedScatter(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+
+ def benchmark():
+ return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testCopy(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ # ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ y = x.clone()
+
+ def benchmark():
+ return y.copy_(x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/repeat.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/replicate.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
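
A hedged sketch of how `replicate` is typically called: one value per expert bin (for example, a routing weight) is broadcast to every token routed to that bin, and the backward pass reduces gradients back to one value per bin. The shapes and values below are purely illustrative assumptions and require the compiled extension plus a CUDA device:

```python
import torch

# Hypothetical shapes: 4 experts, 8 routed tokens (illustrative only).
expert_values = torch.randn(1, 4, device="cuda")  # one value per expert bin
# Inclusive cumulative token counts per expert; the last entry equals the
# total number of outputs.
bins = torch.tensor([2, 5, 5, 8], dtype=torch.int32, device="cuda")

# Broadcast each expert's value across the tokens in its bin -> (1, 8).
per_token = replicate(expert_values, bins, 8)
```
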
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/round_up.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
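
The truncating-division trick above rounds each entry up to the next multiple of `value`, which is how per-expert token counts are padded to the block size (e.g. 128) elsewhere in this package. A small worked sketch, assuming the helper above is importable:

```python
import torch

# Round int32 token counts up to the next multiple of the block size.
tokens_per_expert = torch.tensor([1, 128, 129, 300], dtype=torch.int32)
print(round_up(tokens_per_expert, 128))
# tensor([128, 128, 256, 384], dtype=torch.int32)
```
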
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/scatter.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
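
`scatter` is the un-permute step of the MoE pipeline: it takes expert-ordered outputs, applies the per-token router weights, and sums the `top_k` copies of each token back into token order (its data gradient reuses the matching `gather` kernel). The sketch below only illustrates the assumed call pattern, mirroring the benchmark code earlier in this diff; it requires the compiled extension and a CUDA device:

```python
import torch

# Hypothetical sizes (illustrative only).
sl, hs, ne, top_k = 16, 32, 4, 1

# Routing metadata, built the same way as in the benchmarks above.
top_expert = torch.randint(0, ne, (sl,), device="cuda", dtype=torch.int32)
bin_ids, indices = ops.sort(top_expert)
bins = ops.inclusive_cumsum(ops.histogram(top_expert, ne), 0)

# Stand-in for expert outputs already laid out in expert order
# (e.g. as produced by the gather/padded_gather ops in this package).
permuted = torch.randn(sl * top_k, hs, device="cuda", dtype=torch.float16)
weights = torch.rand(sl * top_k, device="cuda", dtype=torch.float16)

# Un-permute back to token order, weighting and summing the top_k copies.
out = scatter(permuted, indices, bin_ids, weights, bins, top_k)  # (sl, hs)
```
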
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/sort.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
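
`sort` is a radix sort over integer keys that also returns the permutation which produced the sorted order; in the MoE pipeline it groups token positions by their assigned expert (as in the benchmarks above). A minimal sketch, assuming the compiled extension and a CUDA device:

```python
import torch

# Expert assignment for five tokens (illustrative).
top_expert = torch.tensor([2, 0, 1, 0, 2], device="cuda", dtype=torch.int32)

# Sorted keys and the argsort-style permutation that produced them.
bin_ids, indices = sort(top_expert)
# bin_ids  -> [0, 0, 1, 2, 2]
# indices  -> original positions of those tokens, grouped by expert
```
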
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/sort_benchmark.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff957d4c552c6e61d9279a7989795472af7b7
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class SortBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_SORT_TESTS)
+ def testSort(self, n, dtype, max_val):
+ if max_val is None:
+ max_val = np.iinfo(numpy_dtype(dtype)).max
+ end_bit = int(np.ceil(np.log2(max_val)))
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_BASELINE_SORT_TESTS)
+ def testTorchSort(self, n):
+ x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+ arguments = {
+ 'n': n,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/stk_autocast.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
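
These decorators play the role of `torch.cuda.amp.custom_fwd`/`custom_bwd` for the ops in this package: under autocast, the forward runs with autocast disabled but its eligible CUDA floating-point arguments are pre-cast to the active autocast dtype, and the backward likewise runs with autocast disabled. A minimal sketch of the pattern (`ScatterOp` above is a real example; the toy function below is hypothetical):

```python
import torch

class ScaleOp(torch.autograd.Function):
    """Toy autograd function wrapped with the decorators defined above."""

    @staticmethod
    @custom_fwd
    def forward(ctx, x, scale):
        # Under autocast, `x` arrives already cast to the autocast dtype.
        ctx.scale = scale
        return x * scale

    @staticmethod
    @custom_bwd
    def backward(ctx, grad):
        return grad * ctx.scale, None
```
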
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/sum.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
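
The special case above squeezes a singleton dimension instead of launching a reduction kernel; the result has the same shape and values as an ordinary sum over that dimension. A tiny sketch (note that this `sum` shadows the builtin inside the module):

```python
import torch

x = torch.randn(1, 8)
y = torch.randn(4, 8)

# Singleton dim: squeezed away rather than reduced (identical result).
assert torch.equal(sum(x, dim=0), x.squeeze(0))
# General case: an ordinary reduction.
assert sum(y, dim=0).shape == (8,)
```
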
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/topology.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/backend/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/backend/autocast.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/backend/sputnik.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/backend/triton_kernels.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store the result block to the sparse output matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/matrix.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+                    f"Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
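
`Matrix` stores a block-compressed-sparse-row (BCSR) layout: `data` holds the nonzero blocks as `(nnz_blocks, block, block)`, `row_indices`/`column_indices` give each block's block-row and block-column, `offsets` marks where each block row begins, and the `*_t` fields are the lazily built transpose metadata from `_transpose`. A hedged sketch of constructing one via the conversion ops (mirroring the tests later in this diff; assumes a CUDA device):

```python
import torch
import stk

# Hypothetical example: 256x256 half-precision matrix with 128x128 blocks.
mask = stk.random.dense_mask(256, 256, 0.5, blocking=128)
dense = (torch.randn(256, 256) * mask).half()

m = stk.ops.to_sparse(dense, blocking=128).cuda()
print(m.shape, m.blocking)      # torch.Size([256, 256]) 128
print(m.data.shape)             # (nnz_blocks, 128, 128)
print(m.offsets)                # one entry per block row, plus one
print(m.row_indices, m.column_indices)
```
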
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/eltwise_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/eltwise_ops_test.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bfd4f6af77042d3c5bdb1fe18d00e457478d46
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+@parameterized.parameters(_ELTWISE_OP_TESTS)
+class EltwiseOpsTest(parameterized.TestCase):
+
+ def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+
+ a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+ b_dense, b = _dense_and_sparse_like(a)
+
+ out = stk.ops.mul(a, b)
+ expected_out = torch.mul(a_dense, b_dense)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size(), out.size())
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = a_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = b_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/linear_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
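
The three wrappers encode the operand layout in their names: `dsd` produces a dense output from sparse × dense, `dds` a dense output from dense × sparse, and `sdd` a block-sparse output (with the topology of `topo`) from dense × dense. A hedged usage sketch mirroring the tests below (assumes a CUDA device; every dimension must be divisible by the blocking, see `_validate_matmul_dims` in the Triton backend):

```python
import torch
import stk

m, k, n, blocking = 512, 512, 512, 128
cuda = torch.device("cuda")

# Block-sparse LHS, dense RHS -> dense output.
a_mask = stk.random.dense_mask(m, k, 0.5, blocking)
a = stk.ops.to_sparse((torch.randn(m, k) * a_mask).half(), blocking).cuda()
b = torch.randn(k, n, dtype=torch.float16, device=cuda)
out_dense = stk.ops.dsd(a, b)                    # (m, n) dense tensor

# Dense x dense, computed only where `topo` has nonzero blocks -> stk.Matrix.
topo_mask = stk.random.dense_mask(m, n, 0.5, blocking)
topo = stk.ops.to_sparse((torch.randn(m, n) * topo_mask).half(), blocking).cuda()
lhs = torch.randn(m, k, dtype=torch.float16, device=cuda)
out_sparse = stk.ops.sdd(lhs, b, topo)
```
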
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/linear_ops_test.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced1d782fbc9f9ca16b3449239f1588dc5ff5e00
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+@parameterized.parameters(*_LINEAR_OP_TESTS)
+class LinearOpsTest(parameterized.TestCase):
+
+ def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = _mask(a_dense.grad, a.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = _mask(b_dense.grad, b.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+ _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+ expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/matrix_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/matrix_ops_test.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3af04c0760483e578f93303dc457415948a2a34c
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+from absl.testing import parameterized
+import stk
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class MatrixOpsTest(parameterized.TestCase):
+
+ def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ x = (torch.randn(rows, cols) * mask).type(torch.float16)
+
+ # Convert the matrix to sparse format.
+ sparse_x = stk.ops.to_sparse(x, blocking)
+
+ # Validate the matrix.
+ sparse_x.validate()
+
+ # Validate the shape.
+ self.assertEqual(sparse_x.dim(), 2)
+ self.assertEqual(sparse_x.size()[0], rows)
+ self.assertEqual(sparse_x.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(sparse_x.nnz, nnz)
+
+ # Convert back to dense format.
+ dense_x = stk.ops.to_dense(sparse_x)
+
+ # Validate the shape.
+ self.assertEqual(dense_x.dim(), 2)
+ self.assertEqual(dense_x.size()[0], rows)
+ self.assertEqual(dense_x.size()[1], cols)
+
+ # Validate the sparsity
+ self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+
+ # Validate the output.
+ self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/random/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/random/random_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
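+
+
+if __name__ == '__main__':
+    # Illustrative sketch (not part of the original file): dense_mask builds a
+    # {0, 1} mask with the requested fraction of blocking x blocking blocks
+    # zeroed out. With two 8x8 blocks and sparsity 0.5, exactly one block
+    # (64 entries) survives.
+    m = dense_mask(8, 16, sparsity=0.5, blocking=8)
+    assert m.shape == (8, 16)
+    assert int(torch.count_nonzero(m)) == 64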
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/random/random_ops_test.py b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..587b44ec890c861879c6296b8f9028f5d99ab82f
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/megablocks/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+from absl.testing import parameterized
+from . import random_ops as random
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class RandomOpsTest(parameterized.TestCase):
+
+ def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+ mask = random.dense_mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(
+ torch.count_nonzero(mask).item(),
+ nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask, 0),
+ torch.eq(mask, 1))))
+
+ def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+ mask = random.mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the matrix.
+ mask.validate()
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(mask.nnz, nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask.data, 0),
+ torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code).
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
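+
+
+if __name__ == '__main__':
+    # Illustrative smoke test (not part of the original file) for the
+    # convenience wrappers above. The underlying kernels are CUDA
+    # implementations, so this only runs when a GPU is available.
+    if torch.cuda.is_available():
+        x = torch.randint(0, 8, (16,), dtype=torch.int32, device='cuda')
+        # Inclusive cumulative sum over the single dimension.
+        csum = cumsum(x, dim=0, exclusive=False)
+        # Radix sort over the low 3 bits (values are < 8), returning both the
+        # sorted values and the permutation that produced them.
+        sorted_vals, sorted_idx = argsort(x, end_bit=3)
+        print(csum[-1].item(), sorted_vals.tolist())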
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/activation_fn.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
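+
+
+# Usage sketch (illustrative): apply an elementwise activation to the nonzero
+# blocks of a sparse Matrix while reusing its topology. `topo_matrix` below
+# stands for any stk Matrix, e.g. the output of stk.ops.sdd:
+#
+#   import torch.nn.functional as F
+#   y = act_fn(topo_matrix, F.gelu, approximate='tanh')
+#   y, grad_fn = act_fn(topo_matrix, F.gelu, return_grad_fn=True)
+#
+# Only x.data is transformed; row/column indices and offsets are carried over.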
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/all_to_all.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
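+
+
+# Usage sketch (illustrative; requires an initialized process group and one
+# CUDA device per rank). Each rank sends input_split_sizes[i] rows of x to
+# rank i and receives output_split_sizes[j] rows from rank j:
+#
+#   out, handle = all_to_all(
+#       x,                        # [sum(input_split_sizes), hidden]
+#       output_split_sizes,       # list[int], one entry per rank
+#       input_split_sizes,        # list[int], one entry per rank
+#       group=dist.group.WORLD,
+#       async_op=True,
+#   )
+#   ...                           # overlap other work here
+#   handle.wait()                 # out is valid after the wait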
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/arguments.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in shared expert (purpose: to allow using a custom FC layer, e.g. te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by the number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+            import triton
+            from packaging import version as _version
+            # Compare parsed versions; plain string comparison misorders
+            # releases such as '3.10.0' vs '3.2.0'.
+            if _version.parse(triton.__version__) >= _version.parse('3.2.0'):
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
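+
+
+# Construction sketch (illustrative values only). The default device factory
+# queries torch.cuda.current_device(), so a visible GPU is assumed, and
+# mlp_impl='grouped' requires the grouped_gemm backend to be installed:
+#
+#   args = Arguments(
+#       hidden_size=1024,
+#       ffn_hidden_size=4096,
+#       moe_num_experts=8,
+#       moe_top_k=2,
+#       mlp_impl='grouped',
+#       fp16=False,
+#       bf16=True,
+#   )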
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/common.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/dmlp_registry.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
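+
+
+# Usage sketch (illustrative): the registry maps the (mlp_type, mlp_impl)
+# pair carried by an Arguments instance to a concrete expert MLP class.
+#
+#   args = Arguments(mlp_type='glu', mlp_impl='grouped', ...)
+#   expert_mlp = get(args)   # -> glu.GroupedGLU instance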
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/dmoe.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
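+        #
+        # Worked example (illustrative): with blocks_per_row = 4, the block in
+        # sparse row 2, column slot 1 has offset 2 * 4 + 1 = 9. After the sort
+        # by column index, integer-dividing that offset by blocks_per_row
+        # recovers row 2, which becomes a column index of the transposed
+        # matrix.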
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/gelu.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
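+
+
+if __name__ == '__main__':
+    # Illustrative sanity check (not part of the original file): on plain
+    # tensors, the fused in-place backward should agree with autograd through
+    # the tanh-approximated GeLU up to the truncated constants used above.
+    x = torch.randn(64, dtype=torch.float64, requires_grad=True)
+    F.gelu(x, approximate='tanh').backward(torch.ones_like(x))
+    manual = gelu_backward_(torch.ones_like(x), x.detach().clone())
+    print(torch.allclose(manual, x.grad, atol=1e-6))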
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/glu.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
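+
+
+# Computation sketch (illustrative, dense notation): per expert, the GLU
+# variants above all evaluate
+#
+#   h = activation_fn(x @ w1.T) * (x @ v1.T)   # gated hidden state
+#   y = h @ w2                                 # down projection
+#
+# The sparse (stk) and grouped (gmm) code paths only change how the
+# block-diagonal expert batching is expressed, not this arithmetic.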
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/memory_test.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MiB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/mlp.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # Use a weighted sum for the shared expert output,
+ # weighted by the number of experts used.
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/moe.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} tokens_per_expert entries '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+ f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
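+# Illustrative example (hypothetical numbers): with one layer, 8 tokens,
+# moe_num_experts=4 and moe_top_k=1, if tokens_per_expert = [4, 2, 1, 1] and the
+# mean expert_scores are [0.5, 0.25, 0.125, 0.125], then
+#   scale = (4 * moe_loss_weight) / (1 * 8 * 1) = 0.5 * moe_loss_weight
+#   loss  = scale * dot([4, 2, 1, 1], [0.5, 0.25, 0.125, 0.125])
+#         = 1.375 * moe_loss_weight
+# A perfectly balanced routing ([2, 2, 2, 2] with mean scores [0.25, 0.25,
+# 0.25, 0.25]) gives loss = moe_loss_weight.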
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
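+ # Example (hypothetical values): with moe_top_k=2, tokens=1024, a single
+ # expert-parallel rank and num_experts=128, tokens_per_expert is
+ # 2 * 1024 / 128 = 16, so moe_capacity_factor=1.0 gives a capacity of 16 slots.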
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
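+ # Tiny illustration (hypothetical values) with num_experts=3 and
+ # top_expert = [1, 0, 1, 2]:
+ #   bin_ids           = [0, 1, 1, 2]  (sorted expert ids)
+ #   indices           = [1, 0, 2, 3]  (argsort of top_expert)
+ #   tokens_per_expert = [1, 2, 1]
+ #   bins              = [1, 3, 4]     (inclusive cumsum, i.e. bin upper bounds)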
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+ # expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
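+ # Illustrative example (hypothetical values): with 2 expert-parallel ranks,
+ # 2 experts per rank and hidden_sharding_degree=1, suppose this rank computes
+ # tokens_per_expert = [3, 1, 2, 0]. Then send_counts = [3 + 1, 2 + 0] = [4, 2]
+ # (tokens destined for rank 0 and rank 1), and recv_counts are the row sums
+ # of parallel_tokens_per_expert, i.e. how many tokens each peer contributes
+ # for the two experts owned locally.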
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/mpu.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
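+
+# Worked example (hypothetical values): with an expert-parallel world size of 8,
+# moe_num_experts=4 and ffn_hidden_size=4096:
+#   expert_sharding_degree = min(8, 4) = 4
+#   hidden_sharding_degree = 8 // 4    = 2
+#   experts_per_rank       = 4 // 4    = 1
+#   features_per_rank      = 4096 // 2 = 2048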
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/router.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
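+
+# Shape summary for LearnedRouter.forward (descriptive of the code above):
+#   x:              [sl, bs, hidden_size] (flattened to [sl * bs, hidden_size])
+#   scores/logits:  [sl * bs, moe_num_experts]
+#   expert_weights: [sl * bs, moe_top_k]
+#   expert_indices: [sl * bs, moe_top_k]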
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch28-cxx11-cu126-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_megablocks_4f35d2a.abi3.so b/build/torch28-cxx11-cu126-x86_64-linux/_megablocks_4f35d2a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..53500b2527dbc4099ec960d6afcc290797c61fec
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_megablocks_4f35d2a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c25df724a678c8783c3252180f7bcc124e52baf2f6f4f8bd6d69ec2ff2c58e7d
+size 15046832
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..69479df3044627d4a8ac3fb70d0b1f0e9b22deed
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_4f35d2a
+ops = torch.ops._megablocks_4f35d2a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_4f35d2a::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_version.py b/build/torch28-cxx11-cu126-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/backend/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/backend/kernels.py b/build/torch28-cxx11-cu126-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have CUDA.
+# This approach preserves the original code but enables testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has greater or equal
+ # number of rows since they could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
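+# Illustration of the bins/padded_bins relationship (hypothetical values): with
+# bins = [2, 3] and padded_bins = [4, 8], the two tokens of expert 0 land in
+# output rows 0-1 (rows 2-3 stay zero padding) and the single token of expert 1
+# lands in row 4 (rows 5-7 stay zero padding); output_rows = padded_bins[-1] = 8.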
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has greater or equal
+ # number of rows since they could be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
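+# NOTE: binned_gather produces a dense [num_experts, expert_capacity, columns]
+# buffer; tokens routed to an expert beyond its expert_capacity are simply never
+# copied (the kernel grid only has expert_capacity entries per expert), which is
+# how capacity-based token dropping happens in the non-dropless MoE path.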
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/benchmark_util.py b/build/torch28-cxx11-cu126-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print(f'mean time = {time:.3f}ms, std time = {std:.3f}ms')
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
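+
+# Example usage (illustrative; `fn` can be any zero-argument CUDA callable):
+#   mean_ms, std_ms = benchmark_function(lambda: moe(x))
+#   log_benchmark('MoE forward', {'batch_size': 8}, mean_ms, std_ms)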
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/grouped_gemm/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/grouped_gemm/backend.py b/build/torch28-cxx11-cu126-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package, since
+# grouped_gemm is vendored into megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
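+
+# Shape conventions implied by _allocate_output (descriptive of the code above):
+#   default:      a [tokens, k], b [num_groups, k, n] -> c [tokens, n]
+#   trans_b=True: a [tokens, k], b [num_groups, n, k] -> c [tokens, n]
+#   trans_a=True: a [tokens, k], b [tokens, n]        -> c [num_groups, k, n]
+# where the rows of the 2-D operands are grouped contiguously according to
+# batch_sizes.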
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/grouped_gemm/ops.py b/build/torch28-cxx11-cu126-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/grouped_gemm_util.py b/build/torch28-cxx11-cu126-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # grouped_gemm is vendored into this package (the `grouped_gemm`
+ # subpackage below), so the upstream `import grouped_gemm` is unnecessary.
+ _grouped_gemm_is_available = True
+except ImportError as error:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+ '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/layers.py b/build/torch28-cxx11-cu126-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3a66957b08d748fd5b4fca8ad5f2c68c81cf429
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/layers.py
@@ -0,0 +1,1230 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Compute the expert sharding degree from the world size and number of experts
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
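+
+# Example (hypothetical value): with moe_jitter_eps=0.01 every activation is
+# scaled by an independent factor drawn uniformly from [0.99, 1.01].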
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
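+
+# Shape sketch (hypothetical sizes): for x of shape [sl=2, bs=3, hs=1152] with
+# moe_num_experts=128 and moe_top_k=4, logits is [6, 128] while expert_weights
+# and expert_indices are both [6, 4]; the weights are the softmax over the
+# selected top-k logits.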
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
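+
+# Shape sketch for mlp_forward (hypothetical sizes): with x of shape
+# [num_experts, capacity, hs], w1 of shape [num_experts, hs, 2*isz] and w2 of
+# shape [num_experts, isz, hs], gate_up is [num_experts, capacity, 2*isz]; the
+# even columns act as the gate, the odd columns as the up projection, and the
+# clamped sigmoid gating produces an output of [num_experts, capacity, hs].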
+
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
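+
+# Example (hypothetical top_k): with moe_top_k=4 and weighted sum enabled, the
+# shared expert contributes 1/5 of the output and the routed experts 4/5;
+# otherwise the two outputs are simply added.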
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+    if len(expert_scores) != num_layers_per_pipeline_stage:
+        raise ValueError(
+            f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+            f"{args.num_layers}\npipeline_model_parallel_size = "
+            f"{args.pipeline_model_parallel_size}\n"
+            "num_layers_per_virtual_pipeline_stage"
+            f" = {args.num_layers_per_virtual_pipeline_stage}",
+        )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+    expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
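+
+# Worked example (hypothetical numbers): with tokens=4, top_k=1 and
+# num_experts=2, perfectly balanced routing gives tokens_per_expert=[2, 2] and
+# mean expert scores of [0.5, 0.5], so the loss is
+# 2 / (4 * 1) * (2*0.5 + 2*0.5) = 1.0, the value reached under perfectly
+# uniform routing.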
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
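+
+# Example (hypothetical assignment): for top_expert=[1, 0, 1, 1] and
+# num_experts=2, the sort yields bin_ids=[0, 1, 1, 1] with indices pointing
+# back to the original token order, tokens_per_expert=[1, 3] and bins=[1, 4],
+# i.e. expert 0 owns slots [0, 1) and expert 1 owns slots [1, 4).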
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
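+
+# Worked example (hypothetical numbers): with tokens=1024, top_k=4,
+# num_experts=128, a single rank and moe_capacity_factor=1.0, each expert is
+# given capacity for int(1.0 * 4 * 1024 * 1 / 128) = 32 token slots.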
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
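+
+# Data-flow sketch for forward_once (single device): tokens are sorted by their
+# assigned expert, gathered into per-expert bins of size expert_capacity, pushed
+# through mlp_forward with one batched matmul per projection, and scattered back
+# to the original token order weighted by the router probabilities.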
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Ensure CUB knows which device to use
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+    # Save the load-balancing loss if needed. The loss weight is currently
+    # fixed at 0.0, so this branch is disabled until it is made configurable.
+    moe_loss_weight = 0.0
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
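+
+# Usage sketch for the shared-expert variant (hypothetical shapes and values,
+# shown for illustration only, not a tested recipe):
+#
+#   mlp = MegaBlocksMoeMLPWithSharedExpert()
+#   # ...attach mlp.router and mlp.experts as for MegaBlocksMoeMLP, then:
+#   shared_up, shared_down, _, _ = create_shared_expert_weights(
+#       hidden_size=1152, shared_expert_hidden_size=2304,
+#       device=torch.device("cuda"), dtype=torch.float32,
+#       init_method=torch.nn.init.kaiming_uniform_,
+#   )
+#   mlp.set_shared_expert_weights(shared_up, shared_down, weighted_sum=True)
+#   out, expert_weights = mlp(x)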
+
+
+# Patch for XPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/megablocks/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/metadata.json b/build/torch28-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,4 @@
+{
+ "version": 1,
+ "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2B elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/binned_gather.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/binned_scatter.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/cumsum.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/gather.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/histogram.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57b7bf8228e01237236748147368b09ffdf8072
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class HistogramBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testTorchHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccc5dcec5e9a663794fad944c45285869c4d1c1
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+class MatmulBenchmark(parameterized.TestCase):
+
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+ blocking = 128
+ padded_tokens, _ = x.size()
+ assert padded_tokens % blocking == 0
+ assert fhs % blocking == 0
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // blocking
+ blocks_per_row = fhs // blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ blocking,
+ block_rows,
+ blocks_per_row,
+ )
+ data = torch.empty(
+ column_indices.numel(),
+ blocking,
+ blocking,
+ dtype=torch.float16,
+ device=x.device,
+ )
+ shape = (padded_tokens, fhs * ne)
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+
+ def build_input_matrix(self, sl, hs, ne):
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Assign tokens to experts uniformly.
+ top_expert = torch.arange(0, sl).cuda().int() % ne
+
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+ return out, padded_bins
+
+ def build_weight_matrix(self, ne, hs, fhs):
+ return torch.randn((hs, ne * fhs)).cuda().half()
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(x, w, topo)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(topo, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradX::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ topo = topo.t()
+
+ def benchmark():
+ return stk.ops.dsd(topo, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(out, w, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ x = x.t()
+
+ def benchmark():
+ return stk.ops.dsd(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+
+ w = w.transpose(1, 2).contiguous()
+ w = w.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd:DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = w.transpose(1, 2).contiguous()
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradX:DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ out = out.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(out, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradW:DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = torch.transpose(w, 1, 2)
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ x = torch.transpose(x, 1, 2)
+
+ def benchmark():
+ return torch.bmm(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/padded_gather.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/padded_scatter.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c575cfe7487d346ba9ec18bbb7ef17f2eb77ec51
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+class PaddedScatterTest(parameterized.TestCase):
+
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+ def testPaddedScatter(self, sl, hs, ne, top_k):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ # Sample weights for the scatter reduce.
+ weights = torch.rand((sl * top_k,)).cuda().half()
+
+ # Gather the data to prepare for backwards.
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ def benchmark():
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+ benchmark_util.log_benchmark(
+ 'Padded Scatter',
+ {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ 'top_k': top_k,
+ },
+ time,
+ std,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6536eeeae402659a087e5c51ef9840627af56501
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+class PermuteBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedGather(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(indices, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+ return ops.binned_gather(x, indices, bins, ec)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedScatter(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(indices, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ x = ops.binned_gather(x, indices, bins, ec)
+
+ def benchmark():
+ return ops.binned_scatter(x, indices, bins)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedGather(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+ return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedScatter(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+
+ def benchmark():
+ return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testCopy(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ # ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ y = x.clone()
+
+ def benchmark():
+ return y.copy_(x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/repeat.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+    if all(t == 1 for t in tiling):
+ return x
+ return x.repeat(*tiling)
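+
+# Usage sketch (illustrative, not part of the original file):
+#   repeat(x, torch.Size((1, 1)))  # returns x unchanged (no copy)
+#   repeat(x, torch.Size((2, 1)))  # equivalent to x.repeat(2, 1)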
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/replicate.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/round_up.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
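+
+# Example (illustrative, not part of the original file):
+#   round_up(torch.tensor([3, 128, 130], dtype=torch.int32), 128)
+#   -> tensor([128, 128, 256], dtype=torch.int32)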
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/scatter.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/sort.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
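+
+# Usage sketch (illustrative, not part of the original file): when the values are
+# known to be bounded, e.g. expert ids in [0, ne), passing a smaller end_bit bounds
+# the number of bits the kernel has to sort on:
+#   bin_ids, indices = sort(top_expert, max(int(ne - 1).bit_length(), 1))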
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff957d4c552c6e61d9279a7989795472af7b7
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class SortBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_SORT_TESTS)
+ def testSort(self, n, dtype, max_val):
+ if max_val is None:
+ max_val = np.iinfo(numpy_dtype(dtype)).max
+ end_bit = int(np.ceil(np.log2(max_val)))
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_BASELINE_SORT_TESTS)
+ def testTorchSort(self, n):
+ x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+ arguments = {
+ 'n': n,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/stk_autocast.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+ elif isinstance(x, map):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/sum.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
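+
+# Example (illustrative, not part of the original file): for x of shape (1, 4),
+# sum(x, dim=0) squeezes the singleton dimension instead of reducing, returning a
+# view with the same values; for shape (3, 4) it falls back to x.sum(dim=0).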
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/ops/topology.py b/build/torch28-cxx11-cu126-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/backend/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/backend/autocast.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+ elif isinstance(x, map):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
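+
+# Note (editorial, not in the original source): custom_fwd casts eligible CUDA
+# floating-point arguments to the active autocast dtype and then runs the wrapped
+# forward with autocast disabled, so the autograd Function sees one consistent
+# dtype; custom_bwd likewise runs the backward with autocast disabled.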
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/backend/sputnik.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
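+
+# For example (editorial note, not in the original source): a contiguous (m, n)
+# tensor has strides (n, 1); after .t() it has shape (n, m) with strides (1, n),
+# which is exactly the pattern _is_transposed checks for.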
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/backend/triton_kernels.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ #Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
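+
+# Example (illustrative, not part of the original file): for a block-sparse matrix
+# with offsets = [0, 2, 3] (two nonzero blocks in row 0, one in row 1), the kernel
+# writes out = [0, 0, 1], i.e. one block-row id per nonzero block.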
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/matrix.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D data.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
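+
+# Worked example (editorial, not in the original source): for row_indices = [0, 1, 1]
+# and column_indices = [1, 0, 1] with two block columns, argsorting the column
+# indices can give gather_indices = [1, 0, 2] (ties may land in either order), so
+# column_indices_t = [1, 0, 1], block_offsets_t = [1, 0, 2] and offsets_t = [0, 1, 3]
+# (one block in column 0, two blocks in column 1).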
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+                    f"Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
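+
+# Usage sketch (illustrative, not part of the original file): given an stk.Matrix
+# `a`, an elementwise scale sharing its topology can be built and applied as
+#   b = Matrix(a.size(), torch.full_like(a.data, 2.0), a.row_indices,
+#              a.column_indices, a.offsets)
+#   c = mul(a, b)  # same sparsity pattern, entries a.data * 2.0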
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bfd4f6af77042d3c5bdb1fe18d00e457478d46
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+@parameterized.parameters(_ELTWISE_OP_TESTS)
+class EltwiseOpsTest(parameterized.TestCase):
+
+ def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+
+ a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+ b_dense, b = _dense_and_sparse_like(a)
+
+ out = stk.ops.mul(a, b)
+ expected_out = torch.mul(a_dense, b_dense)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size(), out.size())
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = a_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = b_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/linear_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced1d782fbc9f9ca16b3449239f1588dc5ff5e00
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+@parameterized.parameters(*_LINEAR_OP_TESTS)
+class LinearOpsTest(parameterized.TestCase):
+
+ def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = _mask(a_dense.grad, a.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = _mask(b_dense.grad, b.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+ _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+ expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
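+
+# Illustrative sketch (not part of the original source): with blocking=2, a single
+# block coordinate expands into the four element coordinates it covers. Assuming a
+# small CPU tensor for brevity:
+#
+#     >>> idxs = torch.tensor([[1, 3]])            # block at row 1, col 3
+#     >>> _expand_for_blocking(idxs, blocking=2)
+#     tensor([[2, 6], [2, 7], [3, 6], [3, 7]])     # element rows 2-3, cols 6-7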
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
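+
+# Illustrative round trip (not part of the original source), mirroring the unit
+# tests; dense_mask comes from stk.random in normal usage:
+#
+#     >>> mask = dense_mask(16, 16, 0.5, blocking=8)
+#     >>> x = (torch.randn(16, 16) * mask).half()
+#     >>> sp = to_sparse(x, blocking=8)
+#     >>> bool(torch.equal(to_dense(sp), x))
+#     True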
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3af04c0760483e578f93303dc457415948a2a34c
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+from absl.testing import parameterized
+import stk
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class MatrixOpsTest(parameterized.TestCase):
+
+ def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ x = (torch.randn(rows, cols) * mask).type(torch.float16)
+
+ # Convert the matrix to sparse format.
+ sparse_x = stk.ops.to_sparse(x, blocking)
+
+ # Validate the matrix.
+ sparse_x.validate()
+
+ # Validate the shape.
+ self.assertEqual(sparse_x.dim(), 2)
+ self.assertEqual(sparse_x.size()[0], rows)
+ self.assertEqual(sparse_x.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(sparse_x.nnz, nnz)
+
+ # Convert back to dense format.
+ dense_x = stk.ops.to_dense(sparse_x)
+
+ # Validate the shape.
+ self.assertEqual(dense_x.dim(), 2)
+ self.assertEqual(dense_x.size()[0], rows)
+ self.assertEqual(dense_x.size()[1], cols)
+
+ # Validate the sparsity
+ self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+
+ # Validate the output.
+ self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/random/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/random/random_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
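+
+# Illustrative sketch (not part of the original source): with sparsity=0.75 and
+# blocking=4, an 8x8 mask keeps round(4 * 0.25) = 1 of its 4 blocks, i.e. 16 ones.
+#
+#     >>> m = dense_mask(8, 8, sparsity=0.75, blocking=4)
+#     >>> int(m.sum())
+#     16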
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py b/build/torch28-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..587b44ec890c861879c6296b8f9028f5d99ab82f
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+from absl.testing import parameterized
+from .. import random
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class RandomOpsTest(parameterized.TestCase):
+
+ def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+ mask = random.dense_mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(
+ torch.count_nonzero(mask).item(),
+ nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask, 0),
+ torch.eq(mask, 1))))
+
+ def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+ mask = random.mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the matrix.
+ mask.validate()
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(mask.nnz, nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask.data, 0),
+ torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/xpu_fused_moe.py b/build/torch28-cxx11-cu126-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2e7c6692f101f9141e9d716c8af6ac92be95351
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,577 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops
+
+
+# Install meta kernels for torch.compile compatibility
+def _install_xpu_meta_kernels():
+ """Install meta kernels for XPU MoE operations to support torch.compile"""
+
+ # Patch cutlass_grouped_gemm_interface
+ if hasattr(ops, "cutlass_grouped_gemm_interface"):
+ original_gemm = ops.cutlass_grouped_gemm_interface
+
+ def gemm_with_meta(ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D,
+ expert_first_token_offset, N, K, num_experts,
+ is_B_int4, is_B_mxfp4):
+ if torch.compiler.is_compiling():
+ # Meta implementation - ptr_D is the output, return it
+ return ptr_D
+ return original_gemm(ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D,
+ expert_first_token_offset, N, K, num_experts,
+ is_B_int4, is_B_mxfp4)
+
+ ops.cutlass_grouped_gemm_interface = gemm_with_meta
+
+ # Patch fused_moe_prologue
+ if hasattr(ops, "fused_moe_prologue"):
+ original_prologue = ops.fused_moe_prologue
+
+ def prologue_with_meta(input, token_selected_experts, token_final_scales,
+ workspace, hidden_size, inter_size, num_experts_on_rank):
+ if torch.compiler.is_compiling():
+ # Meta implementation - this op modifies workspace in-place
+ return None
+ return original_prologue(input, token_selected_experts, token_final_scales,
+ workspace, hidden_size, inter_size, num_experts_on_rank)
+
+ ops.fused_moe_prologue = prologue_with_meta
+
+ # Patch moe_gather
+ if hasattr(ops, "moe_gather"):
+ original_gather = ops.moe_gather
+
+ def gather_with_meta(output, moe_output, topk_weights,
+ unpermuted_row_to_permuted_row, num_experts):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output is modified in-place
+ return None
+ return original_gather(output, moe_output, topk_weights,
+ unpermuted_row_to_permuted_row, num_experts)
+
+ ops.moe_gather = gather_with_meta
+
+ # Patch activation ops
+ for act_name in ["silu_and_mul", "gelu_and_mul", "gelu_tanh_and_mul",
+ "gelu_fast", "gelu_new", "gelu_quick", "mul_and_silu",
+ "swigluoai_and_mul"]:
+ if hasattr(ops, act_name):
+ original_act = getattr(ops, act_name)
+
+ def make_act_wrapper(orig_fn):
+ def act_with_meta(*args, **kwargs):
+ if torch.compiler.is_compiling():
+ # Meta implementation - in-place ops, return None
+ return None
+ return orig_fn(*args, **kwargs)
+ return act_with_meta
+
+ setattr(ops, act_name, make_act_wrapper(original_act))
+
+
+# Install meta kernels on module load
+_install_xpu_meta_kernels()
+
+
+# Default grouped GEMM wrapper; an Xe2-specific variant follows below.
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
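+
+# Worked examples (illustrative, not part of the original source):
+#
+#     >>> compute_num_tokens_per_block(64, 8)       # ceilDiv(64, 32) * 8 = 16 <= 32
+#     32
+#     >>> compute_num_tokens_per_block(16384, 128)  # no candidate satisfies the bound
+#     1024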
+
+
+def implement_zp(qweight):
+    # Convert u4 to s4 so the GEMM kernel does not need to apply a zero point.
+    # Only the default zero point (8) is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
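+
+# Illustrative sketch (not part of the original source): each uint8 byte holds two
+# u4 nibbles; subtracting the default zero point (8) and re-encoding as
+# sign-magnitude s4 turns 0xAB (nibbles 10 and 11) into +2 and +3, packed as 0x23.
+#
+#     >>> implement_zp(torch.tensor([[0xAB]], dtype=torch.uint8))
+#     tensor([[35]], dtype=torch.uint8)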
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+    is_fp8: bool
+    is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ function; temporarily exposed here until GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
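+    # Note (illustrative, not part of the original source): each size below is
+    # padded up to a multiple of 256 bytes, so e.g. a 1000-byte request reserves
+    # 1024 bytes and the next region starts at offset 1024.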
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size].view(torch.int64)
+ unpermuted_row_to_permuted_row = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size].view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ unpermuted_row_to_permuted_row,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
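+
+# Illustrative sketch (not part of the original source): with moe_jitter_eps=0.1
+# every element is scaled by a uniform factor in [0.9, 1.1).
+#
+#     >>> y = apply_jitter(torch.ones(2, 3), 0.1)
+#     >>> bool(((y >= 0.9) & (y < 1.1)).all())
+#     True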
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
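+
+# Illustrative sketch (not part of the original source; shapes are hypothetical):
+# route 4 tokens of width 8 across 16 experts with top-2 routing.
+#
+#     >>> router = torch.nn.Linear(8, 16)
+#     >>> logits, weights, idx = route_tokens_xpu(
+#     ...     torch.randn(4, 8), router.weight, router.bias,
+#     ...     moe_top_k=2, moe_num_experts=16)
+#     >>> logits.shape, weights.shape, idx.shape
+#     (torch.Size([4, 16]), torch.Size([4, 2]), torch.Size([4, 2]))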
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=moe_num_experts,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
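+
+# Illustrative sketch (not part of the original source), assuming the underlying
+# kernels accept this dtype/device and the default dim:
+#
+#     >>> x = torch.tensor([1, 2, 3], device="cuda")
+#     >>> cumsum(x)                   # inclusive -> tensor([1, 3, 6], ...)
+#     >>> cumsum(x, exclusive=True)   # exclusive -> tensor([0, 1, 3], ...)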
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
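+
+# Illustrative sketch (not part of the original source): the radix sort operates on
+# integer keys and iota_out receives the permutation that sorts x (dtype/device
+# requirements follow the underlying kernel).
+#
+#     >>> x = torch.tensor([3, 1, 2], dtype=torch.int32, device="cuda")
+#     >>> vals, idx = argsort(x, end_bit=2)
+#     >>> vals, idx   # tensor([1, 2, 3], ...), tensor([1, 2, 0], ...)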
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/activation_fn.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/all_to_all.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/arguments.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in the shared expert (allows a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+        int] = None  # hidden size of the shared expert if it should differ from ffn_hidden_size (the default)
+    shared_expert_weighted_sum: bool = False  # enable using a weighted sum for the shared expert output (weighted by the number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
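+
+# Illustrative usage (not part of the original source), assuming a CUDA device for
+# the default device factory and an available grouped GEMM backend:
+#
+#     >>> args = Arguments(
+#     ...     hidden_size=1024,
+#     ...     ffn_hidden_size=4096,
+#     ...     moe_num_experts=8,
+#     ...     moe_top_k=2,
+#     ...     mlp_impl='grouped',   # 'sparse' requires triton < 3.2.0
+#     ... )
+#     >>> args.shared_expert_hidden_size   # defaults to ffn_hidden_size
+#     4096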
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/common.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/dmlp_registry.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e. only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
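+
+# Illustrative usage (not part of the original source), assuming a CUDA device and
+# an available grouped GEMM backend so the expert weights can be allocated:
+#
+#     >>> args = Arguments(mlp_type='glu', mlp_impl='grouped')
+#     >>> isinstance(get(args), glu.GroupedGLU)
+#     True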
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/dmoe.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+        # There is a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
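+
+        # Illustrative trace (not part of the original source): with blocking=128
+        # and a 256 x 256 topology holding nonzero blocks at (0, 1) and (1, 0),
+        # column_indices = [1, 0] sorts to gather_indices = [1, 0], giving
+        # column_indices_t = [1, 0], block_offsets_t = [1, 0] and
+        # offsets_t = [0, 1, 2] (one block per column of the transpose).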
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+ # the matrix muliplications. Caculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
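+
+        # Illustrative trace (not part of the original source): with blocking=128,
+        # 3 experts and top-1 assignments [2, 0, 2, 1], tokens_per_expert = [1, 1, 2],
+        # bins = [1, 2, 4], padded_tokens_per_expert = [128, 128, 128] and
+        # padded_bins = [128, 256, 384].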
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/gelu.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/glu.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+                'Memory optimized implementation not yet supported for GLU with sparse kernels.',
+ )
+
+        w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1), self.scale_grad(self.w2)
+        w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+        # Apply the activation and gate (GLU).
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
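+        # Grouped GLU over the local experts: for expert e, given its slice x_e of
+        # batch_sizes[e] tokens, this computes (act(x_e @ w1_e.T) * (x_e @ v1_e.T)) @ w2_e.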
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+        w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+    """GLU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/memory_test.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
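+# Each test case is (batch_size, sequence_length, hidden_size, ffn_hidden_size,
+# num_experts, top_k), matching the positional arguments of test_memory below.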
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6))
+
+ # Calculate weight and gradient memory usage.
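+    # NOTE: the factor of 2 assumes 2 bytes per element, since this test
+    # runs the layer in bfloat16.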
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6))
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/mlp.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+    # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
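+        # x arrives routed and padded per expert, shaped roughly
+        # (experts_per_rank, expert_capacity, hidden_size), so both projections
+        # run as batched matmuls (bmm) over the local experts.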
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
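+        # Illustrative example: with weighting enabled and moe_top_k=4, the
+        # shared expert contributes 1/5 of the output and the routed experts 4/5.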
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+            # weighted by the number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/moe.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
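+        # Illustrative example: with top_k=2, tokens=4096, world_size=8,
+        # num_experts=64 and moe_capacity_factor=1.0, each expert keeps at
+        # most int(1.0 * 2 * 4096 * 8 / 64) = 1024 tokens.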
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
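+        # Switch-Transformer-style auxiliary loss: with perfectly uniform
+        # routing the dot product equals tokens * top_k / num_experts, so the
+        # scale normalizes the loss to ~1 in the balanced case.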
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
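+        # Small worked example (illustrative): with num_experts=3 and
+        # top_expert=[2, 0, 1, 0], sorting gives bin_ids=[0, 0, 1, 2] and
+        # indices=[1, 3, 2, 0]; the histogram is tokens_per_expert=[2, 1, 1]
+        # and the inclusive cumsum gives bins=[2, 3, 4].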
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+        tokens_per_expert: torch.Tensor, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignments. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+            # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/mpu.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
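+#
+# Illustrative example: with world_size=8 and moe_num_experts=4, the experts
+# are sharded 4 ways (expert_sharding_degree=4) and each expert's FFN weights
+# are split across 2 devices (hidden_sharding_degree=2), so each rank owns
+# 1 expert and ffn_hidden_size // 2 of its features.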
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/router.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
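+    # Router z-loss: for each router, penalize large logits via
+    # mean_over_tokens(logsumexp(logits)^2), scaled by moe_zloss_weight.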
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch28-cxx11-cu128-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+    """Returns a SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_megablocks_4f35d2a.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/_megablocks_4f35d2a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..13621e1ff5f2f2b5852f2977186205013d3b2b62
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_megablocks_4f35d2a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:30359e1959d207a4ba71879037aafd22b8dada4c664c191dc4310f8b108131f8
+size 20995704
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..69479df3044627d4a8ac3fb70d0b1f0e9b22deed
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_4f35d2a
+ops = torch.ops._megablocks_4f35d2a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_4f35d2a::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_version.py b/build/torch28-cxx11-cu128-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/backend/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/backend/kernels.py b/build/torch28-cxx11-cu128-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have CUDA.
+# This approach preserves the original code but enables testing without a GPU.
+if torch.cuda.is_available() is False:
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
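+#
+# Each program copies row `index_a // TOP_K` of 'a' to row `index_b` of 'b'
+# when A_TO_B is set, or row `index_b` of 'b' back to row `index_a` of 'a'
+# otherwise, optionally scaling by `weights[index_a]`.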
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since its rows may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
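+
+# Note: gather and scatter form a round trip. An illustrative (not executed)
+# sequence, with expert_mlp standing in for the caller's expert computation:
+#   h = gather(x, indices, bin_ids, None, bins, top_k)      # group tokens by expert
+#   h = expert_mlp(h)
+#   y = scatter(h, indices, bin_ids, weights, bins, top_k)  # weighted un-permute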
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
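+#
+# Each program computes one entry of 'wgrad': the dot product between a
+# (padded) expert-output row of 'x' and the incoming gradient row of 'grad'
+# for the corresponding token, i.e. the gradient of that token's routing weight.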
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per token/expert pair. Array 'x' has a greater or
+    # equal number of rows since its bins may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
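+#
+# Unlike the padded copy above, the binned copy uses a fixed-capacity layout of
+# shape (num_experts, expert_capacity, num_columns); tokens beyond an expert's
+# capacity are dropped and unused slots are left as zeros.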
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/benchmark_util.py b/build/torch28-cxx11-cu128-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+    print(f'mean time = {time:.3f}ms, std time = {std:.3f}ms')
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+    for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
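+
+
+# Example usage (illustrative; names are placeholders):
+#   mean_ms, std_ms = benchmark_function(lambda: torch.matmul(a, b))
+#   log_benchmark('Matmul', {'m': a.shape[0], 'n': b.shape[1]}, mean_ms, std_ms)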
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/grouped_gemm/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/grouped_gemm/backend.py b/build/torch28-cxx11-cu128-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
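+# Shape sketch (derived from _allocate_output above; illustrative, not checked
+# against the CUDA op): with trans_a=False and trans_b=False, 'a' is
+# (sum(batch_sizes), k), 'b' is (num_groups, k, n), and the result is
+# (sum(batch_sizes), n); group i multiplies the batch_sizes[i]-row slice of 'a'
+# by b[i].
+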
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/grouped_gemm/ops.py b/build/torch28-cxx11-cu128-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/grouped_gemm_util.py b/build/torch28-cxx11-cu128-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored in this repository, so the external package
+    # import is not required; availability is assumed.
+    # import grouped_gemm
+    _grouped_gemm_is_available = True
+except ImportError as error:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+    msg = (
+        'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+    )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/layers.py b/build/torch28-cxx11-cu128-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3a66957b08d748fd5b4fca8ad5f2c68c81cf429
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/layers.py
@@ -0,0 +1,1230 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+            if torch.compiler.is_compiling():
+                # Meta implementation - return tensors with correct shape/dtype/device
+                return torch.empty_like(x), torch.empty_like(x)
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+                # Meta implementation - match the real kernel's output shape:
+                # (num_experts, expert_capacity, hidden_size)
+                if x.dim() >= 2:
+                    hidden_size = x.size(-1)
+                    return torch.empty(
+                        (bins.shape[0], bin_size, hidden_size),
+                        dtype=x.dtype,
+                        device=x.device,
+                    )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+                # Meta implementation - the real kernel reduces to (tokens, hidden_size)
+                if x.dim() >= 3:
+                    tokens = indices.numel() // top_k if top_k > 0 else x.size(1)
+                    return torch.empty(
+                        (tokens, x.size(2)), dtype=x.dtype, device=x.device
+                    )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
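+
+
+# Illustrative example (assumed values): with world_size=8, moe_num_experts=4
+# and ffn_hidden_size=3072, expert_sharding_degree=4, hidden_sharding_degree=2,
+# experts_per_rank=1 and features_per_rank=1536.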
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
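+    # Note: the gate and up projections are interleaved along the last dim of
+    # 'gate_up'. The activation below is a sigmoid-gated GLU,
+    # glu = gate * sigmoid(alpha * gate), with the gate clamped from above by
+    # 'limit' and the up branch clamped to [-limit, limit].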
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+    expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
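+
+
+# Illustrative example (assumed values): tokens=1024, top_k=4, num_experts=128,
+# world_size=1 and moe_capacity_factor=1.0 give tokens_per_expert = 32 and an
+# expert capacity of 32 slots per expert.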
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
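+
+
+# Note: if each row of expert_scores is a probability distribution over the
+# experts, a perfectly uniform routing makes the value above equal to 1.0;
+# concentrating tokens and scores on the same few experts makes it larger.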
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
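+
+
+# Illustrative example (assumed values): with num_experts=3 and
+# top_expert=[2, 0, 1, 0], sorting gives bin_ids=[0, 0, 1, 2] with 'indices'
+# holding the matching permutation of token positions, the histogram gives
+# tokens_per_expert=[2, 1, 1], and the inclusive cumulative sum gives
+# bins=[2, 3, 4].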
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
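+    # Overview: tokens are permuted locally by target expert, exchanged across
+    # ranks with all_to_all so each rank only runs its local experts, then the
+    # results are exchanged back and un-permuted (see the numbered steps below).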
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Ensure CUB knows which device to use
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+    hidden_size: Optional[int] = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
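+    # router_scores has shape (num_experts, tokens): the dense top-k routing
+    # weights scattered back over the full expert dimension, then transposed.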
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+    hidden_size: Optional[int] = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
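+
+    # Illustrative usage (assumed shapes and names, not taken from the docs):
+    #   mlp = MegaBlocksMoeMLPWithSharedExpert()
+    #   up = torch.empty(shared_hidden, hidden, device="cuda", dtype=torch.bfloat16)
+    #   down = torch.empty(hidden, shared_hidden, device="cuda", dtype=torch.bfloat16)
+    #   mlp.set_shared_expert_weights(up, down, weighted_sum=True)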
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/megablocks/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/metadata.json b/build/torch28-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,4 @@
+{
+ "version": 1,
+ "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
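
Together these ops form the usual token-routing pipeline: sort tokens by expert, count them, turn the counts into bin boundaries, then gather/scatter. A rough usage sketch, assuming a CUDA device and that the compiled extension is importable under a `megablocks` package (hypothetical import path):

```python
# Sketch of the routing pipeline built from these ops (assumes a CUDA device
# and that the compiled megablocks package, including _ops, is importable).
import torch
from megablocks import ops  # hypothetical import path

sl, hs, ne, top_k = 1024, 512, 8, 1
x = torch.randn(sl, hs, device="cuda", dtype=torch.float16)
top_expert = torch.randint(0, ne, (sl,), device="cuda", dtype=torch.int32)

bin_ids, indices = ops.sort(top_expert)                # order tokens by expert
tokens_per_expert = ops.histogram(top_expert, ne)      # per-expert counts
padded = ops.round_up(tokens_per_expert, 128)          # pad to the block size
padded_bins = ops.inclusive_cumsum(padded, 0)
bins = ops.inclusive_cumsum(tokens_per_expert, 0)

x_perm = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
y = ops.padded_scatter(x_perm, indices, bin_ids, None, bins, padded_bins, top_k)
```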
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2-byte (fp16) elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/binned_gather.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
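
`binned_gather` packs tokens into a dense `[num_experts, expert_capacity, hidden]` layout and `binned_scatter` reverses it. A shape-level sketch under the same assumptions as above (CUDA device, hypothetical `megablocks` import path):

```python
# Shape-level sketch of the binned gather/scatter round trip
# (assumes the compiled kernels; `ops` re-exports these wrappers).
import torch
from megablocks import ops  # hypothetical import path

sl, hs, ne, top_k = 512, 256, 4, 1
ec = sl // ne                                   # expert capacity at factor 1.0
x = torch.randn(sl, hs, device="cuda", dtype=torch.float16)
top_expert = torch.randint(0, ne, (sl,), device="cuda", dtype=torch.int32)

bin_ids, indices = ops.sort(top_expert)
bins = ops.inclusive_cumsum(ops.histogram(top_expert, ne), 0)

gathered = ops.binned_gather(x, indices, bins, ec, top_k)            # [ne, ec, hs]
restored = ops.binned_scatter(gathered, indices, None, bins, top_k)  # [sl, hs]
```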
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/binned_scatter.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/cumsum.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
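
Both wrappers share the semantics of a plain cumulative sum; only the implementation is a fused kernel. A CPU reference for what inclusive vs. exclusive means here:

```python
# Reference semantics of the two cumsum flavors (plain PyTorch equivalent;
# the ops above dispatch to a fused CUDA kernel with the same meaning).
import torch

counts = torch.tensor([3, 1, 4, 2], dtype=torch.int32)
inclusive = torch.cumsum(counts, dim=0)   # [3, 4, 8, 10]
exclusive = inclusive - counts            # [0, 3, 4, 8]
print(inclusive.tolist(), exclusive.tolist())
```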
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/gather.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/histogram.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57b7bf8228e01237236748147368b09ffdf8072
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class HistogramBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testTorchHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccc5dcec5e9a663794fad944c45285869c4d1c1
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1), which calls
+# torch.as_strided(...). Circumvent this chain to avoid the overhead
+# it adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+class MatmulBenchmark(parameterized.TestCase):
+
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+ blocking = 128
+ padded_tokens, _ = x.size()
+ assert padded_tokens % blocking == 0
+ assert fhs % blocking == 0
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // blocking
+ blocks_per_row = fhs // blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ blocking,
+ block_rows,
+ blocks_per_row,
+ )
+ data = torch.empty(
+ column_indices.numel(),
+ blocking,
+ blocking,
+ dtype=torch.float16,
+ device=x.device,
+ )
+ shape = (padded_tokens, fhs * ne)
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+
+ def build_input_matrix(self, sl, hs, ne):
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Assign tokens to experts uniformly.
+ top_expert = torch.arange(0, sl).cuda().int() % ne
+
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+ return out, padded_bins
+
+ def build_weight_matrix(self, ne, hs, fhs):
+ return torch.randn((hs, ne * fhs)).cuda().half()
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(x, w, topo)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(topo, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradX::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ topo = topo.t()
+
+ def benchmark():
+ return stk.ops.dsd(topo, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(out, w, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ x = x.t()
+
+ def benchmark():
+ return stk.ops.dsd(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+
+ w = w.transpose(1, 2).contiguous()
+ w = w.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+            '0::Fwd::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = w.transpose(1, 2).contiguous()
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+            '0::GradX::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ out = out.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(out, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+            '0::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = torch.transpose(w, 1, 2)
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ x = torch.transpose(x, 1, 2)
+
+ def benchmark():
+ return torch.bmm(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
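
`build_sparse_matrix` gives every row block the same number of nonzero blocks (`fhs // blocking`), so the offsets are just a strided arange. A toy illustration of that layout arithmetic (pure PyTorch; the column indices themselves come from `ops.topology`):

```python
# Toy illustration of the block-sparse offsets built in build_sparse_matrix.
import torch

blocking = 128
padded_tokens, fhs = 512, 256           # assumed toy sizes, both multiples of 128
block_rows = padded_tokens // blocking  # 4 row blocks
blocks_per_row = fhs // blocking        # 2 nonzero blocks per row block

offsets = torch.arange(0, block_rows * blocks_per_row + 1, blocks_per_row,
                       dtype=torch.int32)
print(offsets.tolist())                 # [0, 2, 4, 6, 8]
```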
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/padded_gather.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/padded_scatter.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
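
`PaddedScatterOp` saves `x` for backward only when the weights gradient is actually needed. A minimal, CPU-only sketch of that save-on-demand pattern with a generic `autograd.Function` (illustrative, not the megablocks op):

```python
# Minimal sketch of the "save x only if the weights gradient is needed"
# pattern used by PaddedScatterOp, shown on a trivial elementwise op.
import torch

class WeightedCopy(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, weights):
        maybe_x = [x] if ctx.needs_input_grad[1] else []
        ctx.save_for_backward(weights, *maybe_x)
        return x * weights

    @staticmethod
    def backward(ctx, grad):
        weights = ctx.saved_tensors[0]
        dx = grad * weights if ctx.needs_input_grad[0] else None
        dw = grad * ctx.saved_tensors[-1] if ctx.needs_input_grad[1] else None
        return dx, dw

x = torch.randn(4, requires_grad=True)
w = torch.randn(4, requires_grad=True)
WeightedCopy.apply(x, w).sum().backward()
```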
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c575cfe7487d346ba9ec18bbb7ef17f2eb77ec51
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+class PaddedScatterTest(parameterized.TestCase):
+
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+ def testPaddedScatter(self, sl, hs, ne, top_k):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ # Sample weights for the scatter reduce.
+ weights = torch.rand((sl * top_k,)).cuda().half()
+
+ # Gather the data to prepare for backwards.
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ def benchmark():
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+ benchmark_util.log_benchmark(
+ 'Padded Scatter',
+ {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ 'top_k': top_k,
+ },
+ time,
+ std,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6536eeeae402659a087e5c51ef9840627af56501
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+class PermuteBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedGather(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+        def benchmark():
+            # top_k == 1: each token is assigned to a single expert.
+            return ops.binned_gather(x, indices, bins, ec, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedScatter(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.binned_gather(x, indices, bins, ec, 1)
+
+        def benchmark():
+            # No scatter weights; top_k == 1 as in the gather above.
+            return ops.binned_scatter(x, indices, None, bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedGather(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+        def benchmark():
+            # top_k == 1: each token is assigned to a single expert.
+            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedScatter(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+        def benchmark():
+            # No scatter weights; top_k == 1 as in the gather above.
+            return ops.padded_scatter(x, indices, bin_ids, None, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testCopy(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ # ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ y = x.clone()
+
+ def benchmark():
+ return y.copy_(x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/repeat.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/replicate.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/round_up.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
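
A small worked example of the round-up arithmetic above (plain PyTorch, no custom kernel needed):

```python
# Worked example: round each count up to the next multiple of `value`.
import torch

x = torch.tensor([1, 128, 129, 300], dtype=torch.int32)
value = 128
rounded = torch.div(x + (value - 1), value, rounding_mode='trunc') * value
print(rounded.tolist())  # [128, 128, 256, 384]
```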
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/scatter.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/sort.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
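
When the values are bounded, passing `end_bit` limits the radix sort to the bits that matter, for example expert ids below `num_experts`. A usage sketch, assuming a CUDA build of the extension and a hypothetical `megablocks` import path:

```python
# Usage sketch for the radix sort (assumes the compiled CUDA extension).
import torch
from megablocks import ops  # hypothetical import path

ne = 8                                          # expert ids live in [0, 8)
top_expert = torch.randint(0, ne, (1024,), device="cuda", dtype=torch.int32)

# Only 3 bits are needed to order values below 8; passing end_bit avoids
# sorting on all 32 bits of the dtype.
bin_ids, indices = ops.sort(top_expert, 3)
```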
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff957d4c552c6e61d9279a7989795472af7b7
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class SortBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_SORT_TESTS)
+ def testSort(self, n, dtype, max_val):
+ if max_val is None:
+ max_val = np.iinfo(numpy_dtype(dtype)).max
+ end_bit = int(np.ceil(np.log2(max_val)))
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_BASELINE_SORT_TESTS)
+ def testTorchSort(self, n):
+ x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+ arguments = {
+ 'n': n,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/stk_autocast.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):  # e.g. kwargs; cast keys and values recursively
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
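
`custom_fwd` disables autocast inside the op but first casts eligible tensors to the ambient autocast dtype; only CUDA floating-point tensors other than float64 qualify. A small sketch of that eligibility rule:

```python
# Sketch of the eligibility rule used by _cast/custom_fwd above.
import torch

def is_eligible(x: torch.Tensor) -> bool:
    return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)

print(is_eligible(torch.zeros(2, dtype=torch.float32)))   # False: CPU tensor
if torch.cuda.is_available():
    print(is_eligible(torch.zeros(2, device="cuda")))                          # True
    print(is_eligible(torch.zeros(2, device="cuda", dtype=torch.float64)))     # False
```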
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/sum.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/ops/topology.py b/build/torch28-cxx11-cu128-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with a better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
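
The topology op emits one int16 column index per nonzero block, i.e. `output_block_rows * output_block_columns` entries laid out row-major. A shape-level sketch under the same assumptions as the other usage sketches (CUDA device, hypothetical `megablocks` import path):

```python
# Shape-level sketch of the topology output (assumes the compiled extension).
import torch
from megablocks import ops  # hypothetical import path

blocking, ne, fhs = 128, 4, 256
padded_tokens_per_expert = torch.full((ne,), 256, device="cuda", dtype=torch.int32)
padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)

block_rows = int(padded_tokens_per_expert.sum()) // blocking   # 8 row blocks
blocks_per_row = fhs // blocking                               # 2 blocks per row
column_indices = ops.topology(padded_bins, blocking, block_rows, blocks_per_row)
assert column_indices.numel() == block_rows * blocks_per_row
```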
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/backend/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/backend/autocast.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):  # e.g. kwargs; cast keys and values recursively
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/backend/sputnik.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/backend/triton_kernels.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to sparse matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
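+
+
+# Usage sketch (illustrative only): row_indices expands BCSR row offsets into
+# one block-row id per nonzero block. For offsets [0, 2, 3] (two nonzero
+# blocks in block-row 0, one in block-row 1) the kernel writes out = [0, 0, 1].
+# The shape, data and column_indices arguments are unused and may be None:
+#
+#   offsets = torch.tensor([0, 2, 3], dtype=torch.int32, device="cuda")
+#   out = torch.empty(3, dtype=torch.int16, device="cuda")
+#   row_indices(None, None, offsets, None, out)   # out -> [0, 0, 1]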
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/matrix.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
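+# Illustrative trace of _transpose: for row_indices [0, 0, 1] and
+# column_indices [0, 1, 0] on a 2x2 block grid, argsort of the column indices
+# is [0, 2, 1], so column_indices_t = [0, 1, 0], block_offsets_t = [0, 2, 1]
+# and offsets_t = [0, 2, 3] (two blocks in transposed block-row 0, one in
+# transposed block-row 1).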
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+ to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
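+
+
+# Usage sketch (illustrative; assumes b was built from a's metadata so the two
+# operands share a topology):
+#
+#   b = Matrix(a.size(), torch.randn_like(a.data),
+#              a.row_indices, a.column_indices, a.offsets)
+#   c = mul(a, b)   # c.data == a.data * b.data, same sparsity pattern as a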
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bfd4f6af77042d3c5bdb1fe18d00e457478d46
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+@parameterized.parameters(_ELTWISE_OP_TESTS)
+class EltwiseOpsTest(parameterized.TestCase):
+
+ def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+
+ a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+ b_dense, b = _dense_and_sparse_like(a)
+
+ out = stk.ops.mul(a, b)
+ expected_out = torch.mul(a_dense, b_dense)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size(), out.size())
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = a_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = b_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/linear_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
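+
+
+# Usage sketch (illustrative; uses the stk.random helpers and shapes that are
+# multiples of the 128x128 blocking assumed by the Triton backend):
+#
+#   w = stk.random.randn((256, 512), sparsity=0.5, blocking=128).cuda()
+#   x = torch.randn(512, 384, device="cuda", dtype=torch.float16)
+#   d = torch.randn(384, 256, device="cuda", dtype=torch.float16)
+#   y = dsd(w, x)          # sparse [256, 512] @ dense [512, 384] -> dense [256, 384]
+#   z = dds(d, w)          # dense [384, 256] @ sparse [256, 512] -> dense [384, 512]
+#   s = sdd(y, x.t(), w)   # dense @ dense -> sparse Matrix with w's topology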
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced1d782fbc9f9ca16b3449239f1588dc5ff5e00
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+@parameterized.parameters(*_LINEAR_OP_TESTS)
+class LinearOpsTest(parameterized.TestCase):
+
+ def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = _mask(a_dense.grad, a.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = _mask(b_dense.grad, b.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+ _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+ expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
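+# Illustrative trace of _expand_for_blocking: with blocking=2, the single
+# block coordinate [[1, 2]] expands to the four element coordinates of that
+# block, [[2, 4], [2, 5], [3, 4], [3, 5]] (block-row 1 covers rows 2-3,
+# block-column 2 covers columns 4-5).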
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
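+
+
+# Usage sketch (illustrative, CPU is sufficient):
+#
+#   m = stk.random.dense_mask(256, 256, 0.5, blocking=128)
+#   x = (torch.randn(256, 256) * m).half()
+#   sp = to_sparse(x, blocking=128)   # Matrix in BCSR form
+#   dn = to_dense(sp)                 # torch.equal(dn, x) holds
+#   total = sum(sp)                   # same value as dn.sum()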
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3af04c0760483e578f93303dc457415948a2a34c
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+from absl.testing import parameterized
+import stk
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class MatrixOpsTest(parameterized.TestCase):
+
+ def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ x = (torch.randn(rows, cols) * mask).type(torch.float16)
+
+ # Convert the matrix to sparse format.
+ sparse_x = stk.ops.to_sparse(x, blocking)
+
+ # Validate the matrix.
+ sparse_x.validate()
+
+ # Validate the shape.
+ self.assertEqual(sparse_x.dim(), 2)
+ self.assertEqual(sparse_x.size()[0], rows)
+ self.assertEqual(sparse_x.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(sparse_x.nnz, nnz)
+
+ # Convert back to dense format.
+ dense_x = stk.ops.to_dense(sparse_x)
+
+ # Validate the shape.
+ self.assertEqual(dense_x.dim(), 2)
+ self.assertEqual(dense_x.size()[0], rows)
+ self.assertEqual(dense_x.size()[1], cols)
+
+ # Validate the sparsity
+ self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+
+ # Validate the output.
+ self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/random/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/random/random_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
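+
+
+# Usage sketch (illustrative):
+#
+#   m = dense_mask(256, 512, sparsity=0.75, blocking=128)   # dense 0/1 float tensor
+#   s = mask(256, 512, sparsity=0.75, blocking=128)         # sparse Matrix of ones
+#   r = randn((4, 64, 512), sparsity=0.75, blocking=128)    # random sparse Matrix;
+#                                                           #   batch dims flattened into rows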
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py b/build/torch28-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..587b44ec890c861879c6296b8f9028f5d99ab82f
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+from absl.testing import parameterized
+from stk import random
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class RandomOpsTest(parameterized.TestCase):
+
+ def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+ mask = random.dense_mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(
+ torch.count_nonzero(mask).item(),
+ nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask, 0),
+ torch.eq(mask, 1))))
+
+ def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+ mask = random.mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the matrix.
+ mask.validate()
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(mask.nnz, nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask.data, 0),
+ torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/xpu_fused_moe.py b/build/torch28-cxx11-cu128-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2e7c6692f101f9141e9d716c8af6ac92be95351
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,577 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops
+
+
+# Install meta kernels for torch.compile compatibility
+def _install_xpu_meta_kernels():
+ """Install meta kernels for XPU MoE operations to support torch.compile"""
+
+ # Patch cutlass_grouped_gemm_interface
+ if hasattr(ops, "cutlass_grouped_gemm_interface"):
+ original_gemm = ops.cutlass_grouped_gemm_interface
+
+ def gemm_with_meta(ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D,
+ expert_first_token_offset, N, K, num_experts,
+ is_B_int4, is_B_mxfp4):
+ if torch.compiler.is_compiling():
+ # Meta implementation - ptr_D is the output, return it
+ return ptr_D
+ return original_gemm(ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D,
+ expert_first_token_offset, N, K, num_experts,
+ is_B_int4, is_B_mxfp4)
+
+ ops.cutlass_grouped_gemm_interface = gemm_with_meta
+
+ # Patch fused_moe_prologue
+ if hasattr(ops, "fused_moe_prologue"):
+ original_prologue = ops.fused_moe_prologue
+
+ def prologue_with_meta(input, token_selected_experts, token_final_scales,
+ workspace, hidden_size, inter_size, num_experts_on_rank):
+ if torch.compiler.is_compiling():
+ # Meta implementation - this op modifies workspace in-place
+ return None
+ return original_prologue(input, token_selected_experts, token_final_scales,
+ workspace, hidden_size, inter_size, num_experts_on_rank)
+
+ ops.fused_moe_prologue = prologue_with_meta
+
+ # Patch moe_gather
+ if hasattr(ops, "moe_gather"):
+ original_gather = ops.moe_gather
+
+ def gather_with_meta(output, moe_output, topk_weights,
+ unpermuted_row_to_permuted_row, num_experts):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output is modified in-place
+ return None
+ return original_gather(output, moe_output, topk_weights,
+ unpermuted_row_to_permuted_row, num_experts)
+
+ ops.moe_gather = gather_with_meta
+
+ # Patch activation ops
+ for act_name in ["silu_and_mul", "gelu_and_mul", "gelu_tanh_and_mul",
+ "gelu_fast", "gelu_new", "gelu_quick", "mul_and_silu",
+ "swigluoai_and_mul"]:
+ if hasattr(ops, act_name):
+ original_act = getattr(ops, act_name)
+
+ def make_act_wrapper(orig_fn):
+ def act_with_meta(*args, **kwargs):
+ if torch.compiler.is_compiling():
+ # Meta implementation - in-place ops, return None
+ return None
+ return orig_fn(*args, **kwargs)
+ return act_with_meta
+
+ setattr(ops, act_name, make_act_wrapper(original_act))
+
+
+# Install meta kernels on module load
+_install_xpu_meta_kernels()
+
+
+# Default grouped GEMM wrapper (cutlass_grouped_gemm_xe2 below is the Xe2-specific variant).
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
+
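+# Illustrative example: with num_tokens=1000 and num_experts_per_node=8, the
+# candidates 32 and 64 are rejected (32 blocks * 8 > 32 and 16 blocks * 8 > 64)
+# and 128 is returned (8 blocks * 8 <= 128).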
+
+def implement_zp(qweight):
+ # change u4 to s4 to avoid zero point in gemm kernel
+ # only support default zero point now
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
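+# Illustrative trace of implement_zp for one packed byte: 0x2B holds the
+# unsigned nibbles 2 (high) and 11 (low). Subtracting the default zero point
+# of 8 gives -6 and 3, which pack_compact re-encodes as sign-plus-low-3-bits
+# nibbles 0b1010 and 0b0011, so 0x2B maps to 0xA3.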
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+ # TODO: this will all be integrated into the C++ op; temporarily exposed here until the GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size].view(torch.int64)
+ unpermuted_row_to_permuted_row = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size].view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ unpermuted_row_to_permuted_row,
+ num_experts_per_node)
+ return output
+
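+# Usage sketch (illustrative only; hypothetical sizes, unquantized bf16
+# weights laid out as in the docstring above):
+#
+#   E, H, I, T, K = 8, 1024, 4096, 16, 2
+#   hidden = torch.randn(T, H, device="xpu", dtype=torch.bfloat16)
+#   w13 = torch.randn(E, 2 * I, H, device="xpu", dtype=torch.bfloat16)
+#   w2 = torch.randn(E, H, I, device="xpu", dtype=torch.bfloat16)
+#   weights, ids = torch.topk(torch.rand(T, E, device="xpu"), K, dim=-1)
+#   out = xpu_fused_moe(hidden, w13, None, None, w2, None, None,
+#                       weights.float(), ids, K, "silu", E)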
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=moe_num_experts,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
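+# Illustrative sketch of the convenience wrappers above (not executed at import
+# time; assumes a CUDA device and integer inputs, which is what the routing code
+# feeds these kernels):
+#
+#   x = torch.tensor([3, 1, 2], dtype=torch.int32, device="cuda")
+#   cumsum(x, dim=0)                  # inclusive -> [3, 4, 6]
+#   cumsum(x, dim=0, exclusive=True)  # exclusive -> [0, 3, 4]
+#   values, order = argsort(x)        # -> [1, 2, 3], [1, 2, 0]
+#   histogram(x, num_bins=4)          # counts of values 0..3 -> [0, 1, 1, 1]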
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/activation_fn.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/all_to_all.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
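+
+# Illustrative sketch (hypothetical sizes): with two expert-parallel ranks, if this
+# rank holds 5 token rows of which 3 go to rank 0 and 2 go to rank 1, then
+# input_split_sizes = [3, 2]; if the peers will send it 4 and 6 rows respectively,
+# output_split_sizes = [4, 6] and `out` is allocated with 10 rows. With
+# async_op=True the returned handle must be waited on before `out` is read.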
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/arguments.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in the shared expert (purpose: to allow using a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by the number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+                import triton
+                from packaging import version as _version
+                if _version.parse(triton.__version__) >= _version.parse('3.2.0'):
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
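+
+# Illustrative sketch (values are hypothetical): a bf16 dropless-MoE configuration
+# using the grouped GEMM backend. `fp16` defaults to True, so it is disabled
+# explicitly when requesting bf16; `device` defaults to the current CUDA device.
+#
+#   args = Arguments(
+#       hidden_size=1024,
+#       ffn_hidden_size=4096,
+#       moe_num_experts=8,
+#       moe_top_k=2,
+#       mlp_type='glu',
+#       mlp_impl='grouped',
+#       fp16=False,
+#       bf16=True,
+#   )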
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/common.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/dmlp_registry.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e. only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
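+
+# Illustrative sketch: with Arguments(mlp_type='glu', mlp_impl='grouped') the call
+# `dmlp_registry.get(args)` returns a GroupedGLU instance, while the defaults
+# ('mlp', 'sparse') return a SparseMLP (subject to the triton < 3.2.0 check in
+# Arguments.__post_init__). Unknown types or backends raise ValueError.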
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/dmoe.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
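+
+    # Illustrative sketch (hypothetical counts, blocking=128): with two experts
+    # receiving 3 and 5 tokens, tokens_per_expert = [3, 5] and bins = [3, 8],
+    # while padded_tokens_per_expert = [128, 128] gives padded_bins = [128, 256].
+    # padded_gather then places each expert's tokens at the start of its padded
+    # 128-row slot so the block-sparse topology can assume full blocks.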
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/gelu.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
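+
+# Illustrative check (sketch, not executed here): the in-place backward above is
+# the analytic derivative of the tanh-approximated GeLU, so for a dense tensor it
+# should agree with autograd, e.g.
+#
+#   x = torch.randn(16, dtype=torch.float64, requires_grad=True)
+#   F.gelu(x, approximate='tanh').sum().backward()
+#   ref = _gelu_backward_inplace(torch.ones_like(x), x.detach())
+#   assert torch.allclose(x.grad, ref, atol=1e-6)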
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/glu.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+        # Apply the activation function and the GLU gate.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
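+# Shape sketch for GroupedGLU.forward (names are illustrative): x arrives already
+# permuted per expert as [total_tokens, hidden_size], batch_sizes sums to
+# total_tokens, and w1/v1/w2 are viewed as [ne, ffn_hidden_per_rank, hidden_size].
+# Both trans_b grouped GEMMs produce [total_tokens, ffn_hidden_per_rank]; the
+# elementwise product applies the GLU gate, and the final grouped GEMM maps back
+# to [total_tokens, hidden_size].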
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/memory_test.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MiB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/mlp.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
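+# Illustrative sketch (assuming pure expert parallelism, i.e. hidden sharding
+# degree 1): with moe_num_experts=8 and an expert-parallel world size of 4, each
+# rank slices out 2 experts' full [ffn_hidden_size, hidden_size] weights, and
+# because every rank initializes the same master tensor first, the slices are
+# identical across data-parallel replicas for a given seed.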
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+    # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+            # weighted by the number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/moe.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f'Expected {num_layers_per_pipeline_stage} tokens_per_expert entries '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
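+# Worked example (hypothetical, single pipeline stage so num_layers layers are
+# accumulated here): with moe_loss_weight=0.1, 8 experts, top_k=2, 2 layers and
+# 1024 tokens, scale = (8 * 0.1) / (2 * 1024 * 2) ~= 1.95e-4. Under perfect
+# balance each expert sees 256 assignments per layer with mean router score 1/8,
+# so the dot product is 2 * 8 * 256 * 0.125 = 512 and the loss equals the
+# configured moe_loss_weight, 0.1.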
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
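+
+    # Illustrative numbers (hypothetical): with 4096 tokens on this rank, top_k=2,
+    # a single expert-parallel rank, 8 experts and moe_capacity_factor=1, each
+    # expert's capacity is int(1 * 2 * 4096 * 1 / 8) = 1024 token slots.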
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
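+    # Minimal sketch of the loss above with made-up routing results: 4 tokens,
+    # top_k = 1 and 2 experts, perfectly balanced routing.
+    #
+    #   tokens_per_expert = torch.tensor([2.0, 2.0])
+    #   expert_scores = torch.full((4, 2), 0.5)
+    #   scale = 2 / (4 * 1)  # num_experts / (tokens * top_k)
+    #   loss = scale * torch.dot(tokens_per_expert, expert_scores.mean(dim=0))
+    #   # loss == 1.0, the value this loss takes for perfectly balanced routing.
+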
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
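+    # Worked example (hypothetical inputs) of what indices_and_bins returns.
+    # With top_expert = [1, 0, 1, 0] and num_experts = 2:
+    #   bin_ids           = [0, 0, 1, 1]  # expert ids in sorted order
+    #   indices           = [1, 3, 0, 2]  # permutation that produces that order
+    #   tokens_per_expert = [2, 2]        # histogram of assignments
+    #   bins              = [2, 4]        # inclusive cumsum; expert e owns rows
+    #                                     # bins[e-1]:bins[e] after the gather
+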
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+            # If we're sharding the experts along the hidden dimension,
+            # multiple devices own parts of the same set of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+        # If we're sharding the experts along the hidden dimension,
+        # multiple devices own parts of the same set of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+            # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+        # Reduce along the hidden sharding dimension to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+        # Un-permute locally to set up for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+        # NOTE: If we're going to cast the activations to a lower precision,
+        # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/mpu.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
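+
+
+# Worked example (assumed configuration, for illustration only): with
+# world_size = 8, moe_num_experts = 4 and ffn_hidden_size = 4096,
+#   expert_sharding_degree -> min(8, 4) = 4
+#   hidden_sharding_degree -> 8 // 4    = 2
+#   experts_per_rank       -> 4 // 4    = 1
+#   features_per_rank      -> 4096 // 2 = 2048
+# i.e. each rank owns one expert and half of that expert's ffn features.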
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/router.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
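+# Minimal sketch of the unscaled z-loss for a single router (made-up logits):
+#
+#   logits = torch.tensor([[1.0, 2.0, 3.0], [0.5, 0.5, 0.5]])
+#   zloss = torch.logsumexp(logits, dim=1).square().mean()
+#
+# The loss penalizes large router logits, keeping the softmax well conditioned;
+# it is scaled by args.moe_zloss_weight before being returned above.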
+
+# NOTE: To enable end-to-end benchmarking without convergence, we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
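+    # Shape sketch (illustrative): for scores of shape [num_tokens, num_experts],
+    # both branches return a (values, indices) pair of shape
+    # [num_tokens, moe_top_k]; the max() branch is just a cheaper special case
+    # for moe_top_k == 1.
+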
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch28-cxx11-cu129-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_megablocks_4f35d2a.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/_megablocks_4f35d2a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..778d495436e824f26c32d2932a721e98d5c38807
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_megablocks_4f35d2a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5738502578db7ee323b7c4fbf17f40cac29061c825f88ce0e9d331ec0f3e7f06
+size 16003376
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..69479df3044627d4a8ac3fb70d0b1f0e9b22deed
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_4f35d2a
+ops = torch.ops._megablocks_4f35d2a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_4f35d2a::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_version.py b/build/torch28-cxx11-cu129-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/backend/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/backend/kernels.py b/build/torch28-cxx11-cu129-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have
+# CUDA. This approach preserves the original code while enabling testing
+# without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+    # NOTE: There is no padding, so the number of output rows equals the
+    # number of input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
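+# Round-trip sketch (hypothetical call, shapes only): with top_k = 1 and no
+# weights, `scatter` undoes the permutation applied by `gather`:
+#
+#   y = gather(x, indices, bin_ids, None, bins, top_k=1)
+#   # ... expert computation on the permuted rows of y ...
+#   out = scatter(y, indices, bin_ids, None, bins, top_k=1)  # rows align with x
+#
+# With top_k > 1 each token appears top_k times in the permuted tensor and
+# scatter reduces those copies back to one row per token, scaled by `weights`.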
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per entry of 'wgrad'. Array 'x' has a greater or equal
+    # number of rows than 'grad' since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/benchmark_util.py b/build/torch28-cxx11-cu129-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
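+
+
+# Example usage (illustrative only; assumes a CUDA device is available):
+#
+#   a = torch.randn(4096, 4096, device='cuda')
+#   b = torch.randn(4096, 4096, device='cuda')
+#   mean_ms, std_ms = benchmark_function(lambda: torch.matmul(a, b))
+#   log_benchmark('matmul', {'m': 4096, 'n': 4096, 'k': 4096}, mean_ms, std_ms)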
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/grouped_gemm/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/grouped_gemm/backend.py b/build/torch28-cxx11-cu129-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package because
+# grouped_gemm is vendored into megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/grouped_gemm/ops.py b/build/torch28-cxx11-cu129-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
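+
+
+# Example usage (illustrative shapes; requires the compiled grouped GEMM
+# backend and a CUDA device):
+#
+#   num_experts, hidden, ffn = 4, 1024, 4096
+#   batch_sizes = torch.tensor([3, 1, 0, 4])  # tokens per expert, on the CPU
+#   a = torch.randn(int(batch_sizes.sum()), hidden, device='cuda', dtype=torch.bfloat16)
+#   b = torch.randn(num_experts, hidden, ffn, device='cuda', dtype=torch.bfloat16)
+#   c = gmm(a, b, batch_sizes)  # c has shape [8, ffn]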
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/grouped_gemm_util.py b/build/torch28-cxx11-cu129-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # import grouped_gemm
+ pass
+ _grouped_gemm_is_available = True
+except ImportError as error:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+ '`pip install git+https://github.com/tgale96/grouped_gemm@main`.',
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/layers.py b/build/torch28-cxx11-cu129-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3a66957b08d748fd5b4fca8ad5f2c68c81cf429
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/layers.py
@@ -0,0 +1,1230 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
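+# Illustrative call (made-up sizes; the router weight/bias would normally come
+# from a trained model):
+#
+#   x = torch.randn(2, 16, 1024)  # [batch, seq, hidden]
+#   w = torch.randn(8, 1024)      # [num_experts, hidden]
+#   logits, weights, indices = route_tokens(
+#       x, w, router_bias=None, moe_top_k=2, moe_num_experts=8)
+#   # logits: [32, 8]; weights and indices: [32, 2], one row per flattened token.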
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
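+# Shape sketch for mlp_forward (illustrative; the shapes are inferred from the
+# bmm calls above, not a documented contract): x is expected as
+# [num_experts, capacity, hidden] so each bmm runs one GEMM per expert,
+# w1 as [num_experts, hidden, 2 * ffn] with gate and up columns interleaved
+# (hence the ::2 / 1::2 split), and w2 as [num_experts, ffn, hidden].
+# gate * sigmoid(alpha * gate) with alpha ~= 1.702 is the sigmoid
+# approximation of GELU, clamped by `limit` for numerical stability.
+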
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ for x in expert_scores
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
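+
+# Worked example of the scale above (illustrative numbers only): moe_loss_weight=0.1,
+# moe_num_experts=8, num_layers=4, tokens=1024 and moe_top_k=2 give
+# scale = (8 * 0.1) / (4 * 1024 * 2) ~= 9.77e-5, applied to the dot product of the
+# concatenated per-expert token counts and mean expert scores.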
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
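+
+# Worked example: tokens=4096, top_k=2, num_experts=64, world_size=1 and
+# moe_capacity_factor=1.25 give tokens_per_expert = 2 * 4096 / 64 = 128 and an
+# expert capacity of int(1.25 * 128) = 160.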
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+ tokens, score_num_experts = expert_scores.size()
+ assert score_num_experts == num_experts
+ assert len(tokens_per_expert.size()) == 1
+ (count_num_experts,) = tokens_per_expert.size()
+ assert count_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
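+
+# Minimal sketch on toy inputs (hypothetical values; 16 tokens, 4 experts, top_k=2,
+# so the counts sum to 32 routed assignments):
+#   scores = torch.rand(16, 4).softmax(dim=-1)
+#   counts = torch.tensor([8, 8, 8, 8])
+#   loss = load_balancing_loss(counts, scores, top_k=2, num_experts=4)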
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
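+
+# Usage sketch (requires CUDA, since ops.sort/histogram/inclusive_cumsum are GPU
+# kernels); sort_end_bit should cover log2(num_experts):
+#   top_expert = torch.randint(0, 4, (32,), device="cuda")
+#   indices, bin_ids, bins, tokens_per_expert = indices_and_bins(top_expert, 2, 4)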
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
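+
+# permute_and_compute is the single-device inner loop: binned_gather groups the
+# flattened tokens into a [num_experts, expert_capacity, hidden] buffer, mlp_forward
+# runs all experts as batched matmuls, and binned_scatter applies the expert weights
+# while writing each token back to its original position.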
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
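+
+# Shape contract (illustrative): x is [sl, bs, hs] while expert_weights and
+# top_experts are [sl * bs, top_k]; the returned tensor is [sl * bs, hs] and is
+# reshaped back to the input shape by the caller (see moe_forward below).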
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Launched asynchronously so the exchange overlaps with the local permutation below
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: Optional[float] = None,
+ moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save the load balancing loss if needed. Note that the loss weight is
+ # hard-coded to 0.0 here, so nothing is recorded unless this is made configurable.
+ moe_loss_weight = 0.0
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: Optional[float] = None,
+ moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
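+
+# Usage sketch (hypothetical sizes; any in-place torch init callable works):
+#   up_w, down_w, up_b, down_b = create_shared_expert_weights(
+#       hidden_size=1152,
+#       shared_expert_hidden_size=3072,
+#       device=torch.device("cuda"),
+#       dtype=torch.bfloat16,
+#       init_method=torch.nn.init.xavier_uniform_,
+#   )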
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
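+# Sketch of wiring up the shared expert (assumes the router/experts attributes are
+# configured the same way as for the base MegaBlocksMoeMLP):
+#   mlp = MegaBlocksMoeMLPWithSharedExpert()
+#   up_w, down_w, _, _ = create_shared_expert_weights(
+#       hidden_size=1152, shared_expert_hidden_size=3072,
+#       device=torch.device("cuda"), dtype=torch.float32,
+#       init_method=torch.nn.init.kaiming_uniform_,
+#   )
+#   mlp.set_shared_expert_weights(up_w, down_w, weighted_sum=True)
+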
+
+# Patch for XPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/megablocks/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once added to `sys.modules` it would
+ # also be picked up by other imports. Instead, derive a unique module name
+ # from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/metadata.json b/build/torch28-cxx11-cu129-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/metadata.json
@@ -0,0 +1,4 @@
+{
+ "version": 1,
+ "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2-byte (fp16) elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/binned_gather.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/binned_scatter.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/cumsum.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/gather.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/histogram.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/histogram_benchmark.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57b7bf8228e01237236748147368b09ffdf8072
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class HistogramBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testTorchHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/matmul_benchmark.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccc5dcec5e9a663794fad944c45285869c4d1c1
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+class MatmulBenchmark(parameterized.TestCase):
+
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+ blocking = 128
+ padded_tokens, _ = x.size()
+ assert padded_tokens % blocking == 0
+ assert fhs % blocking == 0
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // blocking
+ blocks_per_row = fhs // blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ blocking,
+ block_rows,
+ blocks_per_row,
+ )
+ data = torch.empty(
+ column_indices.numel(),
+ blocking,
+ blocking,
+ dtype=torch.float16,
+ device=x.device,
+ )
+ shape = (padded_tokens, fhs * ne)
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+
+ def build_input_matrix(self, sl, hs, ne):
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Assign tokens to experts uniformly.
+ top_expert = torch.arange(0, sl).cuda().int() % ne
+
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+ return out, padded_bins
+
+ def build_weight_matrix(self, ne, hs, fhs):
+ return torch.randn((hs, ne * fhs)).cuda().half()
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(x, w, topo)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(topo, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradX::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ topo = topo.t()
+
+ def benchmark():
+ return stk.ops.dsd(topo, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(out, w, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ x = x.t()
+
+ def benchmark():
+ return stk.ops.dsd(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+
+ w = w.transpose(1, 2).contiguous()
+ w = w.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd:DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = w.transpose(1, 2).contiguous()
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradX:DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ out = out.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(out, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradW:DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = torch.transpose(w, 1, 2)
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ x = torch.transpose(x, 1, 2)
+
+ def benchmark():
+ return torch.bmm(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/padded_gather.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/padded_scatter.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c575cfe7487d346ba9ec18bbb7ef17f2eb77ec51
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+class PaddedScatterTest(parameterized.TestCase):
+
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+ def testPaddedScatter(self, sl, hs, ne, top_k):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ # Sample weights for the scatter reduce.
+ weights = torch.rand((sl * top_k,)).cuda().half()
+
+ # Gather the data to prepare for backwards.
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ def benchmark():
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+ benchmark_util.log_benchmark(
+ 'Padded Scatter',
+ {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ 'top_k': top_k,
+ },
+ time,
+ std,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/permute_benchmark.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6536eeeae402659a087e5c51ef9840627af56501
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+class PermuteBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedGather(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+ return ops.binned_gather(x, indices, bins, ec, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedScatter(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ x = ops.binned_gather(x, indices, bins, ec, 1)
+
+ def benchmark():
+ return ops.binned_scatter(x, indices, None, bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedGather(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+ return ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedScatter(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ def benchmark():
+ return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testCopy(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ # ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ y = x.clone()
+
+ def benchmark():
+ return y.copy_(x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/repeat.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all(t == 1 for t in tiling):
+ return x
+ return x.repeat(*tiling)
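+
+
+# Illustrative sketch only (not part of the upstream API): `repeat` is a thin
+# wrapper around Tensor.repeat that skips the copy when the tiling is all ones.
+def _repeat_example():
+ x = torch.ones(2, 3)
+ assert repeat(x, torch.Size((1, 1))) is x # all-ones tiling returns the input unchanged
+ assert repeat(x, torch.Size((2, 1))).shape == (4, 3) # tiled twice along dim 0
+ return x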
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/replicate.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/round_up.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+ # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
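+
+
+# Illustrative sketch only (not part of the upstream API): round_up pads each
+# entry of an int32 tensor up to the next multiple of `value`, e.g. per-expert
+# token counts padded to a block size of 128.
+def _round_up_example():
+ counts = torch.tensor([1, 128, 129, 300], dtype=torch.int32)
+ return round_up(counts, 128) # tensor([128, 128, 256, 384], dtype=torch.int32)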
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/scatter.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
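+# NOTE: In the MoE layers this reverses `gather`: expert outputs arranged by
+# bin are written back to their original token positions, scaled by the
+# routing weights and summed over the top_k copies of each token.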
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/sort.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
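+
+
+# Illustrative sketch only; assumes the compiled megablocks_ops extension and a
+# CUDA device are available. `sort` returns the sorted keys together with the
+# permutation that produced them, which is how the routing code recovers the
+# per-expert token order.
+def _sort_example():
+ top_expert = torch.randint(0, 8, (16,), dtype=torch.int32, device="cuda")
+ bin_ids, indices = sort(top_expert, 3) # 3 bits suffice for values < 8
+ return bin_ids, indices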
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/sort_benchmark.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff957d4c552c6e61d9279a7989795472af7b7
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class SortBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_SORT_TESTS)
+ def testSort(self, n, dtype, max_val):
+ if max_val is None:
+ max_val = np.iinfo(numpy_dtype(dtype)).max
+ end_bit = int(np.ceil(np.log2(max_val)))
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit))
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_BASELINE_SORT_TESTS)
+ def testTorchSort(self, n):
+ x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+ arguments = {
+ 'n': n,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/stk_autocast.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+ elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/sum.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
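+
+
+# Illustrative sketch only (not part of the upstream API): when the reduced
+# dimension has length one the reduction degenerates to a squeeze, which avoids
+# launching a reduction kernel.
+def _sum_example():
+ assert sum(torch.ones(1, 4), dim=0).shape == (4,) # squeeze path
+ assert sum(torch.ones(3, 4), dim=0).shape == (4,) # true reduction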
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/ops/topology.py b/build/torch28-cxx11-cu129-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/backend/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/backend/autocast.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+ elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
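+
+
+# Illustrative sketch only (not part of the upstream API): how the decorators
+# are meant to be used. The wrapped forward sees its floating point CUDA inputs
+# cast to the active autocast dtype, and the backward runs with autocast
+# disabled so gradients stay in the forward dtype.
+class _ScaleExample(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx, x, scale):
+ ctx.scale = scale
+ return x * scale
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, grad):
+ return grad * ctx.scale, None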
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/backend/sputnik.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/backend/triton_kernels.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has a dimension of length {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ # Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/matrix.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) // block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+ to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
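+
+
+# Illustrative sketch only; assumes a CUDA device. `b` is built directly from
+# a's metadata so the shared-topology requirement holds by construction.
+def _mul_example():
+ import torch
+ from .matrix_ops import to_sparse
+ a = to_sparse(torch.randn(128, 128, device="cuda").half(), blocking=128)
+ b = Matrix(a.size(), torch.ones_like(a.data), a.row_indices, a.column_indices, a.offsets)
+ return mul(a, b) # same topology as a, entries equal to a.data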
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bfd4f6af77042d3c5bdb1fe18d00e457478d46
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+@parameterized.parameters(_ELTWISE_OP_TESTS)
+class EltwiseOpsTest(parameterized.TestCase):
+
+ def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+
+ a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+ b_dense, b = _dense_and_sparse_like(a)
+
+ out = stk.ops.mul(a, b)
+ expected_out = torch.mul(a_dense, b_dense)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size(), out.size())
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = a_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = b_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/linear_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
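+
+
+# Illustrative sketch only; assumes a CUDA device and the triton backend. The
+# names encode the storage formats of (output, lhs, rhs): dsd is dense = sparse
+# @ dense, dds is dense = dense @ sparse, and sdd produces a sparse output with
+# the sparsity pattern of `topo` from two dense operands.
+def _linear_ops_example():
+ from .matrix_ops import to_sparse
+ a_dense = torch.randn(256, 256, device="cuda").half()
+ b_dense = torch.randn(256, 256, device="cuda").half()
+ a_sparse = to_sparse(a_dense, blocking=128)
+ topo = to_sparse(torch.ones(256, 256, device="cuda").half(), blocking=128)
+ return dsd(a_sparse, b_dense), dds(a_dense, a_sparse), sdd(a_dense, b_dense, topo)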
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced1d782fbc9f9ca16b3449239f1588dc5ff5e00
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+@parameterized.parameters(*_LINEAR_OP_TESTS)
+class LinearOpsTest(parameterized.TestCase):
+
+ def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = _mask(a_dense.grad, a.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = _mask(b_dense.grad, b.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+ _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+ expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/matrix_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
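+# Illustrative round trip between the dense and sparse formats (values are
+# hypothetical; blocking=1 keeps every nonzero element in its own block):
+#   x = torch.tensor([[1., 0.], [0., 2.]], dtype=torch.float16)
+#   sp = to_sparse(x, blocking=1)   # two 1x1 nonzero blocks
+#   to_dense(sp)                    # reproduces x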
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3af04c0760483e578f93303dc457415948a2a34c
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+from absl.testing import parameterized
+import stk
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class MatrixOpsTest(parameterized.TestCase):
+
+ def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ x = (torch.randn(rows, cols) * mask).type(torch.float16)
+
+ # Convert the matrix to sparse format.
+ sparse_x = stk.ops.to_sparse(x, blocking)
+
+ # Validate the matrix.
+ sparse_x.validate()
+
+ # Validate the shape.
+ self.assertEqual(sparse_x.dim(), 2)
+ self.assertEqual(sparse_x.size()[0], rows)
+ self.assertEqual(sparse_x.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(sparse_x.nnz, nnz)
+
+ # Convert back to dense format.
+ dense_x = stk.ops.to_dense(sparse_x)
+
+ # Validate the shape.
+ self.assertEqual(dense_x.dim(), 2)
+ self.assertEqual(dense_x.size()[0], rows)
+ self.assertEqual(dense_x.size()[1], cols)
+
+ # Validate the sparsity
+ self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+
+ # Validate the output.
+ self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/random/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/random/random_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
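+# Example (illustrative): dense_mask(8, 8, sparsity=0.75, blocking=4) keeps
+# round(4 * 0.25) = 1 of the four 4x4 blocks, i.e. 16 nonzero entries.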
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/stk/random/random_ops_test.py b/build/torch28-cxx11-cu129-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..587b44ec890c861879c6296b8f9028f5d99ab82f
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+from absl.testing import parameterized
+from . import random_ops as random
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class RandomOpsTest(parameterized.TestCase):
+
+ def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+ mask = random.dense_mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(
+ torch.count_nonzero(mask).item(),
+ nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask, 0),
+ torch.eq(mask, 1))))
+
+ def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+ mask = random.mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the matrix.
+ mask.validate()
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(mask.nnz, nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask.data, 0),
+ torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/xpu_fused_moe.py b/build/torch28-cxx11-cu129-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2e7c6692f101f9141e9d716c8af6ac92be95351
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,577 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops
+
+
+# Install meta kernels for torch.compile compatibility
+def _install_xpu_meta_kernels():
+ """Install meta kernels for XPU MoE operations to support torch.compile"""
+
+ # Patch cutlass_grouped_gemm_interface
+ if hasattr(ops, "cutlass_grouped_gemm_interface"):
+ original_gemm = ops.cutlass_grouped_gemm_interface
+
+ def gemm_with_meta(ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D,
+ expert_first_token_offset, N, K, num_experts,
+ is_B_int4, is_B_mxfp4):
+ if torch.compiler.is_compiling():
+ # Meta implementation - ptr_D is the output, return it
+ return ptr_D
+ return original_gemm(ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D,
+ expert_first_token_offset, N, K, num_experts,
+ is_B_int4, is_B_mxfp4)
+
+ ops.cutlass_grouped_gemm_interface = gemm_with_meta
+
+ # Patch fused_moe_prologue
+ if hasattr(ops, "fused_moe_prologue"):
+ original_prologue = ops.fused_moe_prologue
+
+ def prologue_with_meta(input, token_selected_experts, token_final_scales,
+ workspace, hidden_size, inter_size, num_experts_on_rank):
+ if torch.compiler.is_compiling():
+ # Meta implementation - this op modifies workspace in-place
+ return None
+ return original_prologue(input, token_selected_experts, token_final_scales,
+ workspace, hidden_size, inter_size, num_experts_on_rank)
+
+ ops.fused_moe_prologue = prologue_with_meta
+
+ # Patch moe_gather
+ if hasattr(ops, "moe_gather"):
+ original_gather = ops.moe_gather
+
+ def gather_with_meta(output, moe_output, topk_weights,
+ unpermuted_row_to_permuted_row, num_experts):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output is modified in-place
+ return None
+ return original_gather(output, moe_output, topk_weights,
+ unpermuted_row_to_permuted_row, num_experts)
+
+ ops.moe_gather = gather_with_meta
+
+ # Patch activation ops
+ for act_name in ["silu_and_mul", "gelu_and_mul", "gelu_tanh_and_mul",
+ "gelu_fast", "gelu_new", "gelu_quick", "mul_and_silu",
+ "swigluoai_and_mul"]:
+ if hasattr(ops, act_name):
+ original_act = getattr(ops, act_name)
+
+ def make_act_wrapper(orig_fn):
+ def act_with_meta(*args, **kwargs):
+ if torch.compiler.is_compiling():
+ # Meta implementation - in-place ops, return None
+ return None
+ return orig_fn(*args, **kwargs)
+ return act_with_meta
+
+ setattr(ops, act_name, make_act_wrapper(original_act))
+
+
+# Install meta kernels on module load
+_install_xpu_meta_kernels()
+
+
+# Default (non-quantized) grouped GEMM entry point.
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
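+    # e.g. expert_token_count = [2, 3, 1] -> expert_offset = [0, 2, 5, 6]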
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
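+# Example (illustrative): with num_tokens=100 and num_experts_per_node=8 the
+# search stops at 32, since ceilDiv(100, 32) * 8 = 32 <= 32.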
+
+
+def implement_zp(qweight):
+    # Convert u4 to s4 so the GEMM kernel does not need to handle a zero point.
+    # Only the default zero point is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
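+# Example (illustrative): the packed byte 0xF0 holds the u4 nibbles 15 and 0.
+# Subtracting the default zero point of 8 gives the s4 values 7 and -8, which
+# are re-packed in two's complement as 0b0111_1000 = 0x78.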
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
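+    # Shape example (illustrative; values are hypothetical): with num_rows=16,
+    # hidden_size=1024, inter_size=4096, num_experts=8 and topk=2,
+    # hidden_states is [16, 1024], w13 is [8, 8192, 1024] (or the transposed
+    # [8, 1024, 8192] layout handled below), w2 is [8, 1024, 4096],
+    # topk_weights/topk_ids are [16, 2] and the output is [16, 1024].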
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: all of this will be integrated into the C++ function. Temporarily exposed here until the GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size].view(torch.int64)
+ unpermuted_row_to_permuted_row = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size].view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ unpermuted_row_to_permuted_row,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
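+# Shape example (illustrative): for x of shape [tokens, hidden] = [32, 1024],
+# router_weight of shape [num_experts, hidden] = [64, 1024] and moe_top_k=2,
+# logits is [32, 64] while expert_weights and expert_indices are both [32, 2];
+# the weights are softmaxed over the selected top-k experts only.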
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=moe_num_experts,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/__init__.py b/build/torch29-cxx11-cpu-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
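+# Illustrative usage of the convenience wrappers above (assumes the underlying
+# ops are available on the current device; values are hypothetical):
+#   x = torch.tensor([1, 2, 3], dtype=torch.int32)
+#   cumsum(x, exclusive=True)      # -> tensor([0, 1, 3])
+#   histogram(x, num_bins=4)       # per-bin counts for values 0..3
+#   values, order = argsort(x)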
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/__init__.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/activation_fn.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/all_to_all.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
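+# Illustrative usage (requires an initialized process group; sizes are
+# hypothetical): with two ranks each holding a [4, h] tensor,
+#   out, handle = all_to_all(x, output_split_sizes=[2, 2],
+#                            input_split_sizes=[2, 2], group=group)
+# sends one half of x to each rank and returns a [4, h] tensor assembled from
+# the peers' contributions.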
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/arguments.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in the shared expert (allows a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using a weighted sum for the shared expert output (weighted by the number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
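+# Illustrative construction (values are hypothetical; 'grouped' additionally
+# requires the grouped_gemm backend to be available):
+#   args = Arguments(
+#       hidden_size=1024,
+#       ffn_hidden_size=4096,
+#       moe_num_experts=8,
+#       moe_top_k=2,
+#       mlp_impl='grouped',
+#       fp16=False,
+#       bf16=True,
+#   )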
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/common.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/dmlp_registry.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e. only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
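+# Example (illustrative): get(Arguments(mlp_type='glu', mlp_impl='grouped'))
+# returns a GroupedGLU built from the given arguments.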
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/dmoe.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
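+        # Example (illustrative): with blocking=128 and tokens_per_expert of
+        # [3, 1], padded_tokens_per_expert is [128, 128], padded_bins is
+        # [128, 256] and bins is [3, 4].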
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/gelu.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
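+# The magic numbers above come from the tanh GELU approximation:
+# 0.79788456 ~= sqrt(2 / pi) and 0.1070322243 ~= 3 * 0.044715 * sqrt(2 / pi).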
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/glu.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+        w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1), self.scale_grad(self.w2)
+        w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+        w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/memory_test.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6))
+
+    # Calculate weight and gradient memory usage (2 bytes per bf16 element).
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6))
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/mlp.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+    def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+        # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+        # this rank. If the master weights are created first, the PyTorch
+        # caching allocator appears to use the same memory block for these
+        # and the slice, which causes large increases in our peak memory
+        # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+        # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+        # this rank. If the master weights are created first, the PyTorch
+        # caching allocator appears to use the same memory block for these
+        # and the slice, which causes large increases in our peak memory
+        # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+    """MLP for the shared expert.
+
+    Note: adapted (copied and modified) from the LLM-Foundry MPTMLP class.
+    """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+            # Use a weighted sum for the shared expert output, weighted by
+            # the number of experts used so the total weight sums to one.
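+            # Worked example (hypothetical): with moe_top_k=4, t_experts is 5,
+            # so the shared expert output is scaled by 1/5 and the routed
+            # expert output by 4/5.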
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/moe.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
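+
+
+# NOTE: The three helpers above form a simple global accumulator: MoE layers
+# append (tokens_per_expert, expert_scores) during their forward pass,
+# batched_load_balancing_loss() below consumes the recorded values, and
+# clear_load_balancing_loss() should be called once per step so the list does
+# not grow without bound.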
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
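+
+
+# Worked example (hypothetical numbers): with moe_num_experts=8,
+# moe_loss_weight=0.1, num_layers=2, tokens=1024 and moe_top_k=2, the scale is
+# (8 * 0.1) / (2 * 1024 * 2) = 0.8 / 4096, applied to the dot product of the
+# concatenated per-expert token counts and mean routing scores.
+#
+# Minimal usage sketch (illustrative only; `model`, `batch` and `task_loss` are
+# placeholders, and the model's MoE layers are assumed to record their routing
+# statistics internally):
+#
+#   out = model(batch)
+#   loss = task_loss(out) + batched_load_balancing_loss(args)
+#   loss.backward()
+#   clear_load_balancing_loss()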
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks, this is the module that should
+# be wrapped so that the weight all-gathers can be scheduled *before* the expert model
+# parallel all-to-all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
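+        # Worked example (hypothetical numbers): with top_k=2, tokens=4096, an
+        # expert-parallel world size of 4 and 64 experts, tokens_per_expert is
+        # 2 * 4096 * 4 / 64 = 512, so moe_capacity_factor=1.25 gives a capacity
+        # of 640.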
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
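+
+        # Worked example (hypothetical, assuming a stable sort): for
+        # top_expert = [2, 0, 1, 0] with 3 experts, bin_ids = [0, 0, 1, 2],
+        # indices = [1, 3, 2, 0], tokens_per_expert = [2, 1, 1] and
+        # bins = [2, 3, 4].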
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
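+        # At this point x has shape [num_experts, expert_capacity, hidden_size];
+        # binned_gather zero-pads each expert's slice up to expert_capacity rows.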
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/mpu.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
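+
+
+# Worked example (hypothetical numbers): with an expert-parallel world size of 4
+# and moe_num_experts=2, expert_sharding_degree=2 and hidden_sharding_degree=2,
+# so each rank owns a single expert and ffn_hidden_size // 2 of its features
+# (assuming ffn_hidden_size is even).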
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/router.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
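+
+
+# For reference, each entry of the returned tensor is
+#   moe_zloss_weight * mean_over_tokens(logsumexp(logits, dim=1) ** 2)
+# for one recorded router, which penalizes large router logit magnitudes.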
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
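+
+
+# Illustrative example: for an input with 6 routing decisions and num_experts=4,
+# the forced assignment is arange(6) % 4 = [0, 1, 2, 3, 0, 1], reshaped to the
+# input's shape.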
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch29-cxx11-cpu-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+    """Returns a SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
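+
+
+# Example (illustrative): Arguments with mlp_type='glu' yields a SharedGLU,
+# while mlp_type='mlp' yields a SharedMLP.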
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_megablocks_cpu_6e04dec.abi3.so b/build/torch29-cxx11-cpu-x86_64-linux/_megablocks_cpu_6e04dec.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..fc992d33633b7be30174d1b5dbbe46f6bb5aaea9
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_megablocks_cpu_6e04dec.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18348238274eb1b281afe628b09ca6a4a5b8267370aaed7bf34a2bd91c9b815b
+size 2201200
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_ops.py b/build/torch29-cxx11-cpu-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9010966e70976a4a5febea9802b714fa9a068af4
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cpu_6e04dec
+ops = torch.ops._megablocks_cpu_6e04dec
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cpu_6e04dec::{op_name}"
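+
+
+# Example: add_op_namespace_prefix("sort") returns "_megablocks_cpu_6e04dec::sort".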
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/_version.py b/build/torch29-cxx11-cpu-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/backend/__init__.py b/build/torch29-cxx11-cpu-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/backend/kernels.py b/build/torch29-cxx11-cpu-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have CUDA.
+# This preserves the original code while enabling testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
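+    # (Illustrative: with TOP_K=2, index_a values 4 and 5 both map to row 2 of 'a'.)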
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per entry of 'wgrad' (tokens * top_k of them). The input
+    # 'x' has a greater or equal number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
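+# A shape sketch of the round trip above (hypothetical sizes; `sort`,
+# `histogram` and `inclusive_cumsum` refer to the companion ops used elsewhere
+# in this package to build `indices` and `bins`):
+#
+#   x = torch.randn(4, 8, device="cuda")                      # 4 tokens
+#   top_expert = torch.tensor([1, 0, 1, 0], device="cuda").int()
+#   bin_ids, indices = sort(top_expert, 1)                    # [0,0,1,1], [1,3,0,2]
+#   bins = inclusive_cumsum(histogram(top_expert, 2), 0)      # [2, 4]
+#   y = binned_gather(x, indices, None, bins, 4, 1)           # (2, 4, 8)
+#   x2 = binned_scatter(y, indices, None, bins, 1)            # (4, 8)
+
+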
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/benchmark_util.py b/build/torch29-cxx11-cpu-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+    print(f'mean time = {time:.3f}ms, std time = {std:.3f}ms')
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
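+
+
+# A minimal usage sketch (the workload below is hypothetical, not part of this
+# module): time a CUDA op and report it with the logger above.
+#
+#   a = torch.randn(4096, 4096, device="cuda")
+#   mean_ms, std_ms = benchmark_function(lambda: torch.matmul(a, a))
+#   log_benchmark("matmul", {"n": 4096}, mean_ms, std_ms)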
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/cpu_fused_moe.py b/build/torch29-cxx11-cpu-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
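+# A tiny numeric sketch of the formula above (not part of the module API):
+# with gate = up = 0 the output is (0 + 1) * 0 * sigmoid(0) = 0, while inputs
+# beyond the limit saturate, e.g. gate = up = 10 clamps both to 7, giving
+# (7 + 1) * 7 * sigmoid(7 * 1.702) ≈ 56.
+
+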
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
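+# Shape sketch (hypothetical sizes): for x of shape (tokens, hidden) = (4, 8),
+# a router weight of shape (num_experts, hidden) = (2, 8) and moe_top_k = 2,
+# `logits` is (4, 2) while `expert_weights` and `expert_indices` are both
+# (4, 2); the weights are softmax-normalized over the selected experts.
+
+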
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+    This implementation loops over the experts and processes all tokens routed
+    to each expert with batched matrix operations, which avoids per-token
+    Python overhead on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # For each expert, find the (token_idx, topk_pos) pairs routed to it and
+    # process those tokens as a single batch.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
+
+
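+# A minimal CPU sketch wiring the routing helper into the fused loop above
+# (all sizes below are hypothetical):
+#
+#   T, H, I, E, K = 4, 8, 16, 2, 2
+#   x = torch.randn(T, H)
+#   w1 = torch.randn(E, H, 2 * I) * 0.02      # interleaved gate/up layout
+#   w2 = torch.randn(E, I, H) * 0.02
+#   router_w = torch.randn(E, H)
+#   _, weights, ids = route_tokens_cpu(x, router_w, None, K, E)
+#   out = cpu_fused_moe(x, w1, w2, weights, ids)   # -> (4, 8)
+
+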
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py b/build/torch29-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "CPUMegaBlocksMoeMLP"]
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/grouped_gemm/__init__.py b/build/torch29-cxx11-cpu-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/grouped_gemm/backend.py b/build/torch29-cxx11-cpu-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/grouped_gemm/ops.py b/build/torch29-cxx11-cpu-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
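+
+
+# Shape sketch (hypothetical sizes): grouping three GEMMs of 2, 3 and 1 rows.
+#
+#   a = torch.randn(6, 16, device="cuda")
+#   b = torch.randn(3, 16, 32, device="cuda")
+#   batch_sizes = torch.tensor([2, 3, 1])
+#   c = gmm(a, b, batch_sizes)   # (6, 32): rows 0-1 use b[0], 2-4 b[1], 5 b[2]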
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/grouped_gemm_util.py b/build/torch29-cxx11-cpu-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored into this package, so the external import is
+    # not needed here:
+    # import grouped_gemm
+    _grouped_gemm_is_available = True
+except ImportError:
+    warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/layers.py b/build/torch29-cxx11-cpu-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
+
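+# Worked example (hypothetical sizes): with world_size = 8, moe_num_experts = 4
+# and ffn_hidden_size = 3072, expert_sharding_degree = min(8, 4) = 4,
+# hidden_sharding_degree = 8 // 4 = 2, experts_per_rank = 4 // 4 = 1 and
+# features_per_rank = 3072 // 2 = 1536.
+
+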
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
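+# For example, with moe_jitter_eps = 0.01 each activation is scaled by an
+# independent uniform factor drawn from [0.99, 1.01].
+
+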
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
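+# Shape sketch for the expert MLP above (hypothetical sizes): with x of shape
+# (num_experts, capacity, hidden) = (2, 4, 8), w1 of shape (2, 8, 2 * 16) and
+# w2 of shape (2, 16, 8), `gate_up` is (2, 4, 32), the de-interleaved `gate`
+# and `up` are each (2, 4, 16), and `next_states` is (2, 4, 8).
+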
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
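+# For example, with shared_expert_weighted_sum = True and moe_top_k = 3 the
+# shared expert contributes 1/4 of the output and the routed experts 3/4.
+
+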
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
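+# Worked example (hypothetical sizes): 1024 tokens, top_k = 4, 128 experts,
+# capacity factor 1.0 and no expert parallelism give
+# int(1.0 * 4 * 1024 * 1 / 128) = 32 slots per expert.
+
+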
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
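+# Note: if `expert_scores` are per-token routing probabilities, perfectly
+# uniform routing gives tokens_per_expert = tokens * top_k / num_experts and
+# expert_scores.mean(dim=0) = 1 / num_experts for every expert, so the loss
+# evaluates to exactly 1.0, the balanced-routing baseline.
+
+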
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
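+# Worked example (hypothetical routing, assuming a stable sort): for
+# top_expert = [1, 0, 1, 0] and num_experts = 2, sorting yields
+# bin_ids = [0, 0, 1, 1] and indices = [1, 3, 0, 2], the histogram gives
+# tokens_per_expert = [2, 2], and the inclusive cumulative sum gives
+# bins = [2, 4].
+
+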
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+        # Start the exchange asynchronously so it overlaps with the local
+        # permutation below.
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+    uniform_expert_assignment: bool = False,
+    training: bool = False,
+    w1: Optional[torch.Tensor] = None,
+    w2: Optional[torch.Tensor] = None,
+    w1_bias: Optional[torch.Tensor] = None,
+    w2_bias: Optional[torch.Tensor] = None,
+    gradient_scale: Optional[float] = None,
+    alpha: float = 1.702,
+    sort_end_bit: int = 0,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+    moe_capacity_factor: float = 1.0,
+    moe_expert_model_parallelism: bool = False,
+    forward_fn: Any = None,
+    hidden_size: Optional[int] = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
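+        # Number of bits needed to radix-sort the expert indices (at least one).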
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
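+# A minimal usage sketch (hypothetical sizes, bf16 weights assumed) showing how the
+# helpers above fit together; the router and experts are configured as for MegaBlocksMoeMLP:
+#
+#   mlp = MegaBlocksMoeMLPWithSharedExpert()
+#   up_w, down_w, up_b, down_b = create_shared_expert_weights(
+#       hidden_size=1152,
+#       shared_expert_hidden_size=3072,
+#       device=torch.device("cuda"),
+#       dtype=torch.bfloat16,
+#       init_method=torch.nn.init.kaiming_uniform_,
+#   )
+#   mlp.set_shared_expert_weights(up_w, down_w, up_b, down_b)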
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/megablocks/__init__.py b/build/torch29-cxx11-cpu-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
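+# Load the package __init__.py one directory above this file and re-export its
+# symbols, so this build-variant directory behaves like the top-level package.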
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/metadata.json b/build/torch29-cxx11-cpu-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5381dd80836f863378b9f33a559815688de9287
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/metadata.json
@@ -0,0 +1,5 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/__init__.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2B elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
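+    # init_process_group returns None; the calls below then fall back to the default process group.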
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/binned_gather.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/binned_scatter.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/cumsum.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/gather.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/histogram.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/histogram_benchmark.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57b7bf8228e01237236748147368b09ffdf8072
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class HistogramBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testTorchHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/matmul_benchmark.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccc5dcec5e9a663794fad944c45285869c4d1c1
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+class MatmulBenchmark(parameterized.TestCase):
+
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+ blocking = 128
+ padded_tokens, _ = x.size()
+ assert padded_tokens % blocking == 0
+ assert fhs % blocking == 0
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // blocking
+ blocks_per_row = fhs // blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ blocking,
+ block_rows,
+ blocks_per_row,
+ )
+ data = torch.empty(
+ column_indices.numel(),
+ blocking,
+ blocking,
+ dtype=torch.float16,
+ device=x.device,
+ )
+ shape = (padded_tokens, fhs * ne)
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+
+ def build_input_matrix(self, sl, hs, ne):
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Assign tokens to experts uniformly.
+ top_expert = torch.arange(0, sl).cuda().int() % ne
+
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+ return out, padded_bins
+
+ def build_weight_matrix(self, ne, hs, fhs):
+ return torch.randn((hs, ne * fhs)).cuda().half()
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(x, w, topo)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(topo, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradX::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ topo = topo.t()
+
+ def benchmark():
+ return stk.ops.dsd(topo, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(out, w, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ x = x.t()
+
+ def benchmark():
+ return stk.ops.dsd(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+
+ w = w.transpose(1, 2).contiguous()
+ w = w.transpose(1, 2)
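+        # contiguous() materializes w as (ne, fhs, hs); the second transpose views it
+        # back as (ne, hs, fhs) so bmm sees a transposed ("NT") operand without a copy.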
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd:DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = w.transpose(1, 2).contiguous()
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradX:DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ out = out.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(out, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradW:DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = torch.transpose(w, 1, 2)
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ x = torch.transpose(x, 1, 2)
+
+ def benchmark():
+ return torch.bmm(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/padded_gather.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/padded_scatter.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c575cfe7487d346ba9ec18bbb7ef17f2eb77ec51
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+class PaddedScatterTest(parameterized.TestCase):
+
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+ def testPaddedScatter(self, sl, hs, ne, top_k):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ # Sample weights for the scatter reduce.
+ weights = torch.rand((sl * top_k,)).cuda().half()
+
+ # Gather the data to prepare for backwards.
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ def benchmark():
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+ benchmark_util.log_benchmark(
+ 'Padded Scatter',
+ {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ 'top_k': top_k,
+ },
+ time,
+ std,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/permute_benchmark.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6536eeeae402659a087e5c51ef9840627af56501
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+class PermuteBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedGather(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+            return ops.binned_gather(x, indices, bins, ec, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedScatter(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.binned_gather(x, indices, bins, ec, 1)
+
+ def benchmark():
+            return ops.binned_scatter(x, indices, None, bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedGather(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedScatter(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ def benchmark():
+            return ops.padded_scatter(x, indices, bin_ids, None, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testCopy(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ # ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ y = x.clone()
+
+ def benchmark():
+ return y.copy_(x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/repeat.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
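+    # Equivalent to x.repeat(*tiling), but skips the copy when every tiling factor is 1.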
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/replicate.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
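+        # Replicates each value of 'x' across its bin's span, producing 'num_outputs' columns per row.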
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/round_up.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
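+    # Example: round_up(tensor([5, 128, 129], dtype=int32), 128) -> tensor([128, 128, 256]).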
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/scatter.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/sort.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
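+        # ops.sort writes the sorted keys into x_out and the matching permutation (argsort) into iota_out.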
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/sort_benchmark.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff957d4c552c6e61d9279a7989795472af7b7
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class SortBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_SORT_TESTS)
+ def testSort(self, n, dtype, max_val):
+ if max_val is None:
+ max_val = np.iinfo(numpy_dtype(dtype)).max
+ end_bit = int(np.ceil(np.log2(max_val)))
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_BASELINE_SORT_TESTS)
+ def testTorchSort(self, n):
+ x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+ arguments = {
+ 'n': n,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/stk_autocast.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/sum.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/ops/topology.py b/build/torch29-cxx11-cpu-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
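+        # 'out' receives the column index of every nonzero block in the block-sparse topology.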
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/__init__.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/backend/__init__.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/backend/autocast.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
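+
+
+# Illustrative sketch (not part of the module's API): the decorators are meant
+# to wrap the forward/backward of a torch.autograd.Function so the forward
+# receives inputs cast to the active autocast dtype while autocast itself is
+# disabled inside the op.
+#
+#   class _ScaleByTwo(torch.autograd.Function):
+#       @staticmethod
+#       @custom_fwd
+#       def forward(ctx, x):
+#           return x * 2
+#
+#       @staticmethod
+#       @custom_bwd
+#       def backward(ctx, grad):
+#           return grad * 2
+#
+#   with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+#       y = _ScaleByTwo.apply(torch.randn(4, device="cuda"))  # computed in bf16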
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/backend/sputnik.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/backend/triton_kernels.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+    error_string = "incompatible dimensions: tensor has a dimension of length {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to sparse matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
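+
+
+# Reference semantics of the kernel above (an illustrative sketch, not used by
+# the library): each CSR-style offsets entry expands into one row id per
+# nonzero block, e.g. offsets [0, 2, 3] -> row indices [0, 0, 1].
+#
+#   def row_indices_reference(offsets: torch.Tensor) -> torch.Tensor:
+#       counts = offsets[1:] - offsets[:-1]
+#       return torch.repeat_interleave(
+#           torch.arange(len(counts), device=offsets.device), counts)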
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/matrix.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# TODO:
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers.
+# 3. Make indentation consistent.
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+        raise ValueError(
+            "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
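+
+
+# Illustrative sketch of the BCSR layout this class stores (assumed values,
+# not part of the module): a 2x2 grid of 128x128 blocks with nonzeros at block
+# positions (0, 0) and (1, 1) is described by per-block data plus row/column
+# indices and CSR-style row offsets.
+#
+#   blocking = 128
+#   data = torch.randn(2, blocking, blocking, dtype=torch.float16)
+#   row_indices = torch.tensor([0, 1], dtype=torch.int16)
+#   column_indices = torch.tensor([0, 1], dtype=torch.int16)
+#   offsets = torch.tensor([0, 1, 2], dtype=torch.int32)
+#   m = Matrix((256, 256), data, row_indices, column_indices, offsets)
+#   m.validate()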
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/__init__.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
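+
+
+# Illustrative usage (a sketch; assumes torch and the stk package are
+# importable): build two sparse matrices that share a topology and multiply
+# them element-wise.
+#
+#   import torch
+#   import stk
+#   x = torch.randn(256, 256, dtype=torch.float16)
+#   a = stk.ops.to_sparse(x, blocking=128)
+#   b = stk.ops.ones_like(a)   # same topology as `a`
+#   c = mul(a, b)              # entries equal to a.data * 1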
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bfd4f6af77042d3c5bdb1fe18d00e457478d46
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+@parameterized.parameters(_ELTWISE_OP_TESTS)
+class EltwiseOpsTest(parameterized.TestCase):
+
+ def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+
+ a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+ b_dense, b = _dense_and_sparse_like(a)
+
+ out = stk.ops.mul(a, b)
+ expected_out = torch.mul(a_dense, b_dense)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size(), out.size())
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = a_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = b_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/linear_ops.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
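+
+
+# Illustrative sketch of the three products (assumes stk is importable and the
+# backend kernels are available on the current device):
+#
+#   import stk
+#   a_dense = torch.randn(256, 256, dtype=torch.float16, device="cuda")
+#   a = stk.ops.to_sparse(a_dense, blocking=128)   # sparse operand / topology
+#   b = torch.randn(256, 256, dtype=torch.float16, device="cuda")
+#
+#   y0 = dsd(a, b)       # dense = sparse @ dense
+#   y1 = dds(b, a)       # dense = dense @ sparse
+#   y2 = sdd(b, b, a)    # sparse output restricted to a's topology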
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced1d782fbc9f9ca16b3449239f1588dc5ff5e00
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+@parameterized.parameters(*_LINEAR_OP_TESTS)
+class LinearOpsTest(parameterized.TestCase):
+
+ def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = _mask(a_dense.grad, a.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = _mask(b_dense.grad, b.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+ _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+ expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
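+
+
+# Illustrative round trip (a sketch, mirroring the format-conversion test in
+# matrix_ops_test.py): convert a block-sparse dense matrix to the Matrix
+# format and back.
+#
+#   import stk
+#   mask = stk.random.dense_mask(256, 256, sparsity=0.5, blocking=128)
+#   x = (torch.randn(256, 256) * mask).type(torch.float16)
+#   sparse_x = to_sparse(x, blocking=128)
+#   dense_x = to_dense(sparse_x)
+#   assert torch.all(torch.eq(x, dense_x))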
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3af04c0760483e578f93303dc457415948a2a34c
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+from absl.testing import parameterized
+import stk
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class MatrixOpsTest(parameterized.TestCase):
+
+ def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ x = (torch.randn(rows, cols) * mask).type(torch.float16)
+
+ # Convert the matrix to sparse format.
+ sparse_x = stk.ops.to_sparse(x, blocking)
+
+ # Validate the matrix.
+ sparse_x.validate()
+
+ # Validate the shape.
+ self.assertEqual(sparse_x.dim(), 2)
+ self.assertEqual(sparse_x.size()[0], rows)
+ self.assertEqual(sparse_x.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(sparse_x.nnz, nnz)
+
+ # Convert back to dense format.
+ dense_x = stk.ops.to_dense(sparse_x)
+
+ # Validate the shape.
+ self.assertEqual(dense_x.dim(), 2)
+ self.assertEqual(dense_x.size()[0], rows)
+ self.assertEqual(dense_x.size()[1], cols)
+
+ # Validate the sparsity
+ self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+
+ # Validate the output.
+ self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/random/__init__.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/random/random_ops.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
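+
+
+# Illustrative usage (a sketch): `dense_mask` draws a {0, 1} block mask with
+# the requested sparsity, while `mask` and `randn` return the same structure
+# as a sparse stk Matrix.
+#
+#   m = dense_mask(256, 512, sparsity=0.75, blocking=128)   # dense 0/1 tensor
+#   s = mask(256, 512, sparsity=0.75, blocking=128)         # sparse Matrix of ones
+#   r = randn((256, 512), sparsity=0.75, blocking=128)      # sparse, normal data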
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/stk/random/random_ops_test.py b/build/torch29-cxx11-cpu-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..587b44ec890c861879c6296b8f9028f5d99ab82f
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+from absl.testing import parameterized
+from stk import random
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class RandomOpsTest(parameterized.TestCase):
+
+ def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+ mask = random.dense_mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(
+ torch.count_nonzero(mask).item(),
+ nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask, 0),
+ torch.eq(mask, 1))))
+
+ def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+ mask = random.mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the matrix.
+ mask.validate()
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(mask.nnz, nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask.data, 0),
+ torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cpu-x86_64-linux/xpu_fused_moe.py b/build/torch29-cxx11-cpu-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch29-cxx11-cpu-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM path (cutlass_grouped_gemm_xe2 below is the Xe2-specific variant).
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
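+
+# Illustrative usage (a sketch, not called elsewhere in this module):
+# round-trip a float32 tensor through its raw bytes.
+#
+#   x = torch.arange(4, dtype=torch.float32)
+#   raw = x.view(torch.uint8)                        # 16 raw bytes
+#   y = _bytes_to_typed_tensor(raw, torch.float32)   # equal to x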
+
+
+def implement_zp(qweight):
+    # Convert u4 to s4 so the GEMM kernel does not need to apply a zero point.
+    # Only the default zero point is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+    '''
+    hidden_states: [num_rows, hidden_size]
+    w13: [num_experts, 2*inter_size, hidden_size]
+    w13_scales:
+        None for bf16/fp16
+        or [num_experts] for fp8
+        or [num_experts, 2*inter_size, hidden_size // group_size] for 4-bit weights
+    w13_bias: [num_experts, 2*inter_size] or None
+    w2: [num_experts, hidden_size, inter_size]
+    w2_scales:
+        None for bf16/fp16
+        or [num_experts] for fp8
+        or [num_experts, hidden_size, inter_size // group_size] for 4-bit weights
+    w2_bias: [num_experts, hidden_size] or None
+    topk_weights: [num_rows, topk]
+    topk_ids: [num_rows, topk]
+    n_experts_per_token: int
+    activation: str
+    num_experts: int (number of experts on this rank)
+    ep_rank: int, ep_size: int (expert-parallel rank and world size)
+    is_fp8: bool
+    is_int4: bool
+    is_mxfp4: bool
+    '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+    # 4-bit and mxfp4 weights keep the [E, N, K] layout;
+    # other dtypes are transposed to [E, K, N] if needed.
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ func; temporarily exposed here until GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
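+    # config_ws reserves a named region in one flat uint8 workspace buffer,
+    # padding each region to a 256-byte boundary and recording its (size, offset).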
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
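+    # Grouped GEMM1: [num_moe_inputs, hidden_size] x w13 -> [num_moe_inputs, 2*inter_size],
+    # i.e. the fused gate+up projection over all routed tokens.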
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
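+    # Grouped GEMM2: [num_moe_inputs, inter_size] x w2 -> [num_moe_inputs, hidden_size],
+    # i.e. the down projection back to the model dimension.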
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
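+    # Note: softmax is taken over the selected top-k logits only, not over all experts.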
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+    """Extract device_mesh from the experts child's (unused) forward pre-hook closure, for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
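+        # Assumes moe_num_experts is evenly divisible by ep_size.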
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/activation_fn.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/all_to_all.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/arguments.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in the shared expert (allows using a custom FC layer, e.g. te.Linear, for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/common.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/dmlp_registry.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/dmoe.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/gelu.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/glu.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+    """GLU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/memory_test.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MiB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/mlp.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+    def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
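+        # e.g. (illustrative) batch_sizes = [3, 0, 5, ...] means the first 3
+        # rows of x are routed to expert 0, none to expert 1, the next 5 to
+        # expert 2, and so on; the grouped GEMM uses these as its group sizes.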
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+            # Use a weighted sum for the shared expert output,
+            # weighted by the number of experts used.
+ t_experts = self.args.moe_top_k + 1
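+            # e.g. with moe_top_k = 4, t_experts = 5: the shared expert
+            # contributes 1/5 of the output and the routed experts 4/5.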
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/moe.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f'Expected {num_layers_per_pipeline_stage} tokens_per_expert entries '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f'Expected {num_layers_per_pipeline_stage} expert_scores entries '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
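+    # e.g. (illustrative) num_experts=8, moe_loss_weight=0.1, num_layers=12,
+    # tokens=2048, top_k=2 -> scale = 0.8 / 49152 ~= 1.6e-5.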
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
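+        # e.g. (illustrative) top_k=2, tokens=4096, world_size=8,
+        # num_experts=64 -> tokens_per_expert = 1024; with
+        # moe_capacity_factor=1.25 the capacity is 1280 tokens per expert.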
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
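+        # Note: with perfectly uniform routing (each expert receives
+        # tokens * top_k / num_experts tokens and a mean score of
+        # 1 / num_experts) this loss evaluates to 1.0.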
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
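+            # e.g. (illustrative) with 4 experts split 2 per rank and no
+            # hidden sharding, arange(4) % 2 gives local expert ids
+            # [0, 1, 0, 1]: one entry per (source device, local expert) pair
+            # prior to replication over the received token counts.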
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/mpu.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
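+
+
+# Example (illustrative): with an expert parallel world size of 8 and
+# moe_num_experts = 4, expert_sharding_degree is 4 and hidden_sharding_degree
+# is 2, so each rank owns one expert and ffn_hidden_size // 2 of its features.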
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/router.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
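+        # e.g. for an input with 6 routed slots and num_experts = 4 this
+        # yields assignments [0, 1, 2, 3, 0, 1], reshaped to x's shape below.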
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_layers/sharedexpert_registry.py b/build/torch29-cxx11-cu126-aarch64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_megablocks_cuda_6e04dec.abi3.so b/build/torch29-cxx11-cu126-aarch64-linux/_megablocks_cuda_6e04dec.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..20de7d814a05bdd4ce30e7a8742261aa9f3b5f22
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_megablocks_cuda_6e04dec.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:581f5d3cd17031f674e6da22c23430881408630004e4ece5a57f9c36583665b5
+size 15121720
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_ops.py b/build/torch29-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2f202b8db3c3f3028303ab4308cf35f950e2c74
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
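+
+    e.g. add_op_namespace_prefix("sort") -> "_megablocks_cuda_6e04dec::sort"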
+ """
+ return f"_megablocks_cuda_6e04dec::{op_name}"
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_version.py b/build/torch29-cxx11-cu126-aarch64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/backend/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/backend/kernels.py b/build/torch29-cxx11-cu126-aarch64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have
+# CUDA. This preserves the original code while still enabling tests without a GPU.
+if torch.cuda.is_available() is False:
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since its rows may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since its rows may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/benchmark_util.py b/build/torch29-cxx11-cu126-aarch64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
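+
+
+# Example usage (illustrative):
+#
+#   a = torch.randn(1024, 1024, device='cuda')
+#   mean_ms, std_ms = benchmark_function(lambda: torch.matmul(a, a))
+#   log_benchmark('MatMul', {'n': 1024}, mean_ms, std_ms)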
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/cpu_fused_moe.py b/build/torch29-cxx11-cu126-aarch64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
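+
+    Example (illustrative):
+        For gate = up = 1.0, glu = 1.0 * sigmoid(1.702 * 1.0) ~= 0.846,
+        so the output is (1.0 + 1.0) * 0.846 ~= 1.69.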
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
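+
+    Example (illustrative shapes):
+        For x of shape [B, S, H], router_weight of shape [E, H] and
+        moe_top_k = 2, logits is [B * S, E] while expert_weights and
+        expert_indices are both [B * S, 2].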
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+    This implementation loops over experts but batches all tokens routed to
+    each expert into a single matmul, which is far more efficient on CPU than
+    processing tokens one at a time.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+ # Build expert mask: which tokens go to which expert
+ # expert_mask[expert_id] contains indices of (token_idx, topk_pos) pairs
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
+
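+# Illustrative sketch (random weights, shape check only; the tensors below are
+# local to this example, not part of the module):
+#   x  = torch.randn(8, 64)                      # 8 tokens, hidden_size=64
+#   w1 = torch.randn(4, 64, 2 * 32) * 0.02       # 4 experts, inter_size=32
+#   w2 = torch.randn(4, 32, 64) * 0.02
+#   _, tw, ti = route_tokens_cpu(x, torch.randn(4, 64), None, moe_top_k=2, moe_num_experts=4)
+#   out = cpu_fused_moe(x, w1, w2, tw, ti, activation="silu", is_interleaved=False)
+#   assert out.shape == (8, 64)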
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/cpu_moe_cpp.py b/build/torch29-cxx11-cu126-aarch64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
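+# Illustrative sketch (shapes follow the docstring above; whether a given dtype
+# or packing layout is accepted depends on the compiled _ops kernel):
+#   x  = torch.randn(8, 64, dtype=torch.bfloat16)           # [M, K]
+#   w1 = torch.randn(4, 2 * 32, 64, dtype=torch.bfloat16)   # [E, 2N, K]
+#   w2 = torch.randn(4, 64, 32, dtype=torch.bfloat16)       # [E, K, N]
+#   tw = torch.rand(8, 2); ti = torch.randint(0, 4, (8, 2), dtype=torch.int32)
+#   out = fused_moe_cpp(x, w1, w2, tw, ti)                   # expected shape [8, 64]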
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "CPUMegaBlocksMoeMLP"]
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/grouped_gemm/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/grouped_gemm/backend.py b/build/torch29-cxx11-cu126-aarch64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
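+# Output shapes (sketch, derived from _allocate_output above): with trans_a=False
+# and trans_b=False, `a` is [tokens, k], `b` is [num_experts, k, n] and the result
+# is [tokens, n]; with trans_a=True the result is [num_experts, a.shape[1], b.shape[1]],
+# i.e. one GEMM output per expert.
+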
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/grouped_gemm/ops.py b/build/torch29-cxx11-cu126-aarch64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
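+
+
+# Illustrative sketch (assumes the vendored CUDA backend is available and supports
+# bfloat16; `sizes` lists how many rows of `a` each expert owns):
+#   a = torch.randn(6, 16, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+#   b = torch.randn(2, 16, 32, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+#   sizes = torch.tensor([4, 2])
+#   out = gmm(a, b, sizes)        # [6, 32]
+#   out.sum().backward()          # gradients flow through GroupedGemm.backward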
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/grouped_gemm_util.py b/build/torch29-cxx11-cu126-aarch64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored inside this package, so no external import is needed.
+    _grouped_gemm_is_available = True
+except ImportError:
+    warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/layers.py b/build/torch29-cxx11-cu126-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
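+# Worked example of the sharding arithmetic above: with world_size=8,
+# moe_num_experts=4 and ffn_hidden_size=1024, expert_sharding_degree = min(8, 4) = 4,
+# hidden_sharding_degree = 8 // 4 = 2, experts_per_rank = 4 // 4 = 1 and
+# features_per_rank = 1024 // 2 = 512.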
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
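+# For example, with moe_jitter_eps=0.01 each activation is scaled by an
+# independent random factor drawn from [0.99, 1.01).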
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
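+# Shape sketch for mlp_forward (illustrative): with x [num_experts, capacity, hs],
+# w1 [num_experts, hs, 2*ffn] and w2 [num_experts, ffn, hs], gate_up is
+# [num_experts, capacity, 2*ffn]; the even/odd slices give gate and up of
+# [num_experts, capacity, ffn], and next_states comes back as
+# [num_experts, capacity, hs].
+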
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
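+# Example: with moe_top_k=4 and shared_expert_weighted_sum=True, the shared
+# expert output is weighted by 1/5 and the routed-expert output by 4/5;
+# otherwise the two outputs are simply added.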
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+    tokens, score_num_experts = expert_scores.size()
+    assert score_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (count_num_experts,) = tokens_per_expert.size()
+    assert count_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
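+# Worked example: with top_k=1, num_experts=2, 4 tokens, tokens_per_expert=[3, 1]
+# and mean expert_scores [0.75, 0.25], scale = 2 / (4 * 1) = 0.5 and the loss is
+# 0.5 * (3 * 0.75 + 1 * 0.25) = 1.25.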
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
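+# Illustrative example: top_expert=[2, 0, 2, 1] with num_experts=4 gives
+# tokens_per_expert=[1, 1, 2, 0], bins=[1, 2, 4, 4] (inclusive cumsum),
+# bin_ids=[0, 1, 2, 2] and indices=[1, 3, 0, 2] (token positions sorted by expert).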
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
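+# Worked example: tokens=1024, top_k=4, num_experts=128 on a single rank with
+# moe_capacity_factor=1.0 gives tokens_per_expert = 4 * 1024 / 128 = 32, so the
+# per-expert capacity is 32 slots.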
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Ensure CUB knows which device to use
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/megablocks/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is: once it is added to `sys.modules`,
+    # it would also be picked up by other imports. So we construct a module
+    # name that depends on the path, using the hex-encoded hash of the path,
+    # to keep it unique.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/metadata.json b/build/torch29-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..155112c59509d3b4d07f4d090cbf57071e3f5217
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/all_to_all_benchmark.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per element (fp16).
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/binned_gather.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/binned_scatter.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/cumsum.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
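+
+# Worked example (illustrative, assuming the standard exclusive/inclusive
+# prefix-sum semantics of the underlying kernels): for
+#   x = torch.tensor([1, 2, 3], dtype=torch.int32, device="cuda")
+# exclusive_cumsum(x, 0) yields [0, 1, 3] (sum of the preceding elements),
+# while inclusive_cumsum(x, 0) yields [1, 3, 6] (running total including the
+# current element). MegaBlocks uses the inclusive form to turn per-expert
+# token counts into bin boundaries, e.g. bins = inclusive_cumsum(tokens_per_expert, 0).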
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/gather.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/histogram.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
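+
+# Illustrative example (assuming the kernel counts occurrences of each value in
+# [0, max_val)): for x = torch.tensor([0, 1, 1, 3], dtype=torch.int32, device="cuda")
+# and max_val = 4, histogram(x, max_val) would return [1, 2, 0, 1]. This is how
+# per-token expert assignments become per-expert token counts elsewhere in this
+# package, e.g. tokens_per_expert = histogram(top_expert, num_experts).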
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/histogram_benchmark.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57b7bf8228e01237236748147368b09ffdf8072
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class HistogramBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testTorchHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/matmul_benchmark.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccc5dcec5e9a663794fad944c45285869c4d1c1
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1), which in turn calls
+# torch.as_strided(...). Build the strided view directly to avoid the
+# overhead of that call chain.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+class MatmulBenchmark(parameterized.TestCase):
+
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+ blocking = 128
+ padded_tokens, _ = x.size()
+ assert padded_tokens % blocking == 0
+ assert fhs % blocking == 0
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // blocking
+ blocks_per_row = fhs // blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ blocking,
+ block_rows,
+ blocks_per_row,
+ )
+ data = torch.empty(
+ column_indices.numel(),
+ blocking,
+ blocking,
+ dtype=torch.float16,
+ device=x.device,
+ )
+ shape = (padded_tokens, fhs * ne)
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+
+ def build_input_matrix(self, sl, hs, ne):
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Assign tokens to experts uniformly.
+ top_expert = torch.arange(0, sl).cuda().int() % ne
+
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+ return out, padded_bins
+
+ def build_weight_matrix(self, ne, hs, fhs):
+ return torch.randn((hs, ne * fhs)).cuda().half()
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(x, w, topo)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(topo, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradX::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ topo = topo.t()
+
+ def benchmark():
+ return stk.ops.dsd(topo, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(out, w, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ x = x.t()
+
+ def benchmark():
+ return stk.ops.dsd(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+
+ w = w.transpose(1, 2).contiguous()
+ w = w.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+            '0::Fwd::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = w.transpose(1, 2).contiguous()
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+            '0::GradX::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ out = out.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(out, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+            '0::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = torch.transpose(w, 1, 2)
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ x = torch.transpose(x, 1, 2)
+
+ def benchmark():
+ return torch.bmm(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/padded_gather.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/padded_scatter.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/padded_scatter_benchmark.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c575cfe7487d346ba9ec18bbb7ef17f2eb77ec51
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+class PaddedScatterTest(parameterized.TestCase):
+
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+ def testPaddedScatter(self, sl, hs, ne, top_k):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ # Sample weights for the scatter reduce.
+ weights = torch.rand((sl * top_k,)).cuda().half()
+
+ # Gather the data to prepare for backwards.
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ def benchmark():
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+ benchmark_util.log_benchmark(
+ 'Padded Scatter',
+ {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ 'top_k': top_k,
+ },
+ time,
+ std,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/permute_benchmark.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6536eeeae402659a087e5c51ef9840627af56501
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+class PermuteBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedGather(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+            return ops.binned_gather(x, indices, bins, ec, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedScatter(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.binned_gather(x, indices, bins, ec, 1)
+
+ def benchmark():
+            return ops.binned_scatter(x, indices, None, bins, 1)  # weights=None, top_k=1
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedGather(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedScatter(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ def benchmark():
+            return ops.padded_scatter(x, indices, bin_ids, None, bins, padded_bins, 1)  # weights=None, top_k=1
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testCopy(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ # ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ y = x.clone()
+
+ def benchmark():
+ return y.copy_(x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/repeat.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
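+
+
+# Minimal, illustrative check of the helper above: a tiling of all ones returns
+# the input tensor unchanged (no copy); anything else defers to Tensor.repeat.
+if __name__ == "__main__":
+    x = torch.tensor([[1, 2], [3, 4]])
+    assert repeat(x, torch.Size((1, 1))) is x
+    print(repeat(x, torch.Size((2, 1))).shape)  # rows tiled twice -> (4, 2)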
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/replicate.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/round_up.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
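+
+
+# Small, illustrative sanity check (round_up only uses pure torch ops, so it
+# also runs on CPU): each entry is rounded up to the next multiple of `value`.
+if __name__ == "__main__":
+    x = torch.tensor([3, 130], dtype=torch.int32)
+    print(round_up(x, 128))  # tensor([128, 256], dtype=torch.int32)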
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/scatter.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/sort.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
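+
+# Illustrative example (assuming the kernel is a stable radix sort that also
+# returns the gathering permutation): for
+#   x = torch.tensor([2, 0, 1], dtype=torch.int32, device="cuda")
+# sort(x) would return (tensor([0, 1, 2]), tensor([1, 2, 0])), i.e. the sorted
+# keys and the original position of each sorted element. Callers in this
+# package use it as `bin_ids, indices = ops.sort(top_expert)`.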
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/sort_benchmark.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff957d4c552c6e61d9279a7989795472af7b7
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class SortBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_SORT_TESTS)
+ def testSort(self, n, dtype, max_val):
+ if max_val is None:
+ max_val = np.iinfo(numpy_dtype(dtype)).max
+ end_bit = int(np.ceil(np.log2(max_val)))
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_BASELINE_SORT_TESTS)
+ def testTorchSort(self, n):
+ x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+ arguments = {
+ 'n': n,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/stk_autocast.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/sum.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
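+
+
+# Example: for x of shape (1, 4), sum(x, 0) simply squeezes away the unit
+# dimension and returns a view of shape (4,) instead of launching a reduction;
+# for x of shape (2, 4) it falls back to a regular x.sum(dim=0) of shape (4,).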
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/ops/topology.py b/build/torch29-cxx11-cu126-aarch64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/backend/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/backend/autocast.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/backend/sputnik.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/backend/triton_kernels.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ # Store to the sparse output matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
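+
+# Illustration (added comment, not executed): `row_indices` expands CSR-style block-row
+# offsets into one row id per nonzero block. For example, offsets = [0, 2, 3, 6]
+# (three block rows with 2, 1 and 3 nonzero blocks) fills out = [0, 0, 1, 2, 2, 2].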
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/matrix.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) // block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
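+# Illustration (added comment, not executed): for a 2 x 2 block grid with nonzero blocks
+# at (0, 1) and (1, 0) we have row_indices = [0, 1], column_indices = [1, 0] and
+# offsets = [0, 1, 2]. Sorting by column gives gather_indices = [1, 0], so
+# column_indices_t = [1, 0], block_offsets_t = [1, 0] (the position of each transposed
+# block in the original data) and offsets_t = [0, 1, 2].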
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
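+
+# Minimal construction sketch (illustrative, not executed here). Metadata dtypes follow
+# _validate_matrix: float16 data, int16 block indices, int32 offsets. A 256 x 256 matrix
+# with 128 x 128 blocking and two diagonal nonzero blocks:
+#
+#   data = torch.zeros(2, 128, 128, dtype=torch.float16)
+#   row_indices = torch.tensor([0, 1], dtype=torch.int16)
+#   column_indices = torch.tensor([0, 1], dtype=torch.int16)
+#   offsets = torch.tensor([0, 1, 2], dtype=torch.int32)
+#   m = Matrix((256, 256), data, row_indices, column_indices, offsets)
+#   m.validate()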
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+ to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
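+
+# Usage sketch (illustrative; `a` is an existing stk.Matrix): multiplying by an all-ones
+# matrix that shares a's topology leaves the entries unchanged.
+#
+#   ones = Matrix(a.size(), torch.ones_like(a.data), a.row_indices,
+#                 a.column_indices, a.offsets)
+#   assert torch.equal(mul(a, ones).data, a.data)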
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops_test.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bfd4f6af77042d3c5bdb1fe18d00e457478d46
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+@parameterized.parameters(_ELTWISE_OP_TESTS)
+class EltwiseOpsTest(parameterized.TestCase):
+
+ def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+
+ a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+ b_dense, b = _dense_and_sparse_like(a)
+
+ out = stk.ops.mul(a, b)
+ expected_out = torch.mul(a_dense, b_dense)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size(), out.size())
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = a_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = b_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/linear_ops.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
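+
+# Naming convention: the three letters are (output, lhs, rhs), so dsd produces a Dense
+# output from a Sparse lhs and a Dense rhs, dds a Dense output from Dense x Sparse, and
+# sdd a Sparse output (with the topology of `topo`) from two Dense operands. Usage
+# sketch (illustrative; `a_sparse` is an (M, K) stk.Matrix, `b` a (K, N) CUDA tensor
+# with block-aligned dimensions):
+#
+#   y = dsd(a_sparse, b)            # torch.Tensor of shape (M, N)
+#   z = sdd(y, b.t(), a_sparse)     # stk.Matrix with a_sparse's topology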
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/linear_ops_test.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced1d782fbc9f9ca16b3449239f1588dc5ff5e00
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+@parameterized.parameters(*_LINEAR_OP_TESTS)
+class LinearOpsTest(parameterized.TestCase):
+
+ def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = _mask(a_dense.grad, a.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = _mask(b_dense.grad, b.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+ _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+ expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
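+
+# Round-trip sketch (illustrative, mirrors matrix_ops_test.py): to_sparse followed by
+# to_dense reproduces the input exactly when its zeros are block-aligned.
+#
+#   mask = stk.random.dense_mask(256, 256, 0.5, blocking=128)
+#   x = (torch.randn(256, 256) * mask).type(torch.float16)
+#   assert torch.equal(to_dense(to_sparse(x, blocking=128)), x)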
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops_test.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3af04c0760483e578f93303dc457415948a2a34c
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+from absl.testing import parameterized
+import stk
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class MatrixOpsTest(parameterized.TestCase):
+
+ def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ x = (torch.randn(rows, cols) * mask).type(torch.float16)
+
+ # Convert the matrix to sparse format.
+ sparse_x = stk.ops.to_sparse(x, blocking)
+
+ # Validate the matrix.
+ sparse_x.validate()
+
+ # Validate the shape.
+ self.assertEqual(sparse_x.dim(), 2)
+ self.assertEqual(sparse_x.size()[0], rows)
+ self.assertEqual(sparse_x.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(sparse_x.nnz, nnz)
+
+ # Convert back to dense format.
+ dense_x = stk.ops.to_dense(sparse_x)
+
+ # Validate the shape.
+ self.assertEqual(dense_x.dim(), 2)
+ self.assertEqual(dense_x.size()[0], rows)
+ self.assertEqual(dense_x.size()[1], cols)
+
+ # Validate the sparsity
+ self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+
+ # Validate the output.
+ self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/random/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/random/random_ops.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
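+
+# Example (illustrative): dense_mask(256, 256, sparsity=0.5, blocking=128) returns a
+# {0, 1} float32 tensor in which round(4 * 0.5) = 2 of the four 128 x 128 blocks are
+# kept, i.e. 2 * 128 * 128 nonzero entries.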
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/stk/random/random_ops_test.py b/build/torch29-cxx11-cu126-aarch64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..587b44ec890c861879c6296b8f9028f5d99ab82f
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+from absl.testing import parameterized
+from stk import random
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class RandomOpsTest(parameterized.TestCase):
+
+ def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+ mask = random.dense_mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(
+ torch.count_nonzero(mask).item(),
+ nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask, 0),
+ torch.eq(mask, 1))))
+
+ def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+ mask = random.mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the matrix.
+ mask.validate()
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(mask.nnz, nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask.data, 0),
+ torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/xpu_fused_moe.py b/build/torch29-cxx11-cu126-aarch64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM wrapper: builds the exclusive expert offsets from per-expert token counts.
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
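+# Example of the exclusive prefix sum used above (illustrative): expert_token_count =
+# [3, 1, 2] yields expert_offset = [0, 3, 4, 6], i.e. expert i owns rows
+# [expert_offset[i], expert_offset[i + 1]) of the grouped GEMM operands.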
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
+
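+# Worked example (illustrative): with num_tokens = 1000 and num_experts_per_node = 8 the
+# loop returns 128, since ceilDiv(1000, 128) = 8 and 8 * 8 = 64 <= 128, while the
+# smaller candidates 32 and 64 fail the check.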
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
+
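+# Usage sketch (illustrative): reinterpreting eight zero bytes as a single int64 value:
+#   _bytes_to_typed_tensor(torch.zeros(8, dtype=torch.uint8), torch.int64)  # tensor([0])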
+
+def implement_zp(qweight):
+ # Convert u4 to s4 so the gemm kernel does not need to apply a zero point.
+ # Only the default zero point (8) is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
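+# Worked example (illustrative): the packed uint8 byte 0xA3 holds the u4 nibbles 10 and
+# 3; subtracting the default zero point 8 gives +2 and -5, which repack to the s4
+# nibbles 0x2 and 0xB, i.e. implement_zp maps 0xA3 -> 0x2B.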
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+ # TODO: all of this will be integrated into the C++ func. Temporarily exposed here until gemm fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
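+
+    Example (illustrative):
+        exclusive cumsum of [1, 2, 3] along dim 0 is [0, 1, 3]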
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
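+
+    Example (illustrative):
+        inclusive cumsum of [1, 2, 3] along dim 0 is [1, 3, 6]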
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
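+
+    Example (illustrative):
+        histogram of [0, 1, 1, 3] with num_bins=4 is [1, 2, 0, 1]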
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
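+
+# Illustrative usage (assumes an integer tensor on the device the kernels target):
+#   counts = torch.tensor([3, 1, 2], dtype=torch.int32, device="cuda")
+#   cumsum(counts, dim=0, exclusive=True)   # -> [0, 3, 4]
+#   cumsum(counts, dim=0, exclusive=False)  # -> [3, 4, 6]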
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
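+
+# Illustrative usage (same device assumption as above):
+#   vals, idx = argsort(torch.tensor([2, 0, 1], dtype=torch.int32, device="cuda"))
+#   # vals -> [0, 1, 2]; idx -> [1, 2, 0] (idx shares x's dtype here)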
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/activation_fn.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/all_to_all.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/arguments.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in shared expert (purpose: to allow using custom FC layer eg te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+            try:
+                import triton
+                from packaging import version
+                if version.parse(triton.__version__) >= version.parse('3.2.0'):
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
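+
+
+# Illustrative construction (hypothetical values, not recommended defaults):
+#   args = Arguments(hidden_size=1024, ffn_hidden_size=4096,
+#                    moe_num_experts=8, moe_top_k=2, mlp_impl='grouped')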
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/common.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/dmlp_registry.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/dmoe.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
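+        # e.g. (illustrative): with blocking=128, token counts [130, 1, 0]
+        # round up to [256, 128, 0].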
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/gelu.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/glu.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+        # Apply the activation function and the GLU gating.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/memory_test.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MiB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/mlp.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
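+    # e.g. (illustrative): with expert_sharding_degree=4 and hidden_sharding_degree=2,
+    # ranks 0-3 hold distinct expert slices of the first row shard and ranks 4-7
+    # hold the same expert slices of the second row shard.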
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+    Note: this is a copy -> paste -> modify of the LLM-Foundry MPTMLP class.
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+            # weighted by the number of experts used
+ t_experts = self.args.moe_top_k + 1
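+            # e.g. (illustrative): with moe_top_k=3, the shared expert output is
+            # weighted by 1/4 and the routed expert output by 3/4.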
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/moe.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f'Expected {num_layers_per_pipeline_stage} tokens_per_expert '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
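+    #
+    # A small worked example (illustrative numbers only): with
+    # moe_loss_weight=0.01, moe_num_experts=64, num_layers=24, tokens=4096
+    # and moe_top_k=2, the scale is 0.01 * 64 / (24 * 4096 * 2) ~= 3.3e-6.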
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
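+        # Illustrative example: with moe_top_k=2, tokens=4096, a single
+        # expert-parallel rank and 64 experts, tokens_per_expert is
+        # 2 * 4096 / 64 = 128; with moe_capacity_factor=1.25 the capacity
+        # is int(1.25 * 128) = 160 tokens per expert.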
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
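+    # A small worked example of what indices_and_bins produces (illustrative
+    # only): with 3 experts and top_expert=[2, 0, 2, 1], the sort yields
+    # bin_ids=[0, 1, 2, 2] and indices=[1, 3, 0, 2], the histogram gives
+    # tokens_per_expert=[1, 1, 2], and the inclusive cumsum gives
+    # bins=[1, 2, 4], i.e. the end offset of each expert's tokens in the
+    # sorted order.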
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
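+        # Illustrative example (expert parallelism only, hidden sharding
+        # degree of 1): with 2 ranks and 4 experts (2 per rank), rank 0 might
+        # compute tokens_per_expert=[3, 1, 0, 2]. The all_to_all of counts
+        # then leaves rank 0 with every rank's counts for experts 0 and 1,
+        # and rank 1 with every rank's counts for experts 2 and 3, which is
+        # what the token all_to_all below needs to size its buffers.
+        #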
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+            # If we're sharding the experts along the hidden dimension,
+            # multiple devices own parts of the same set of experts.
+            # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+        # If we're sharding the experts along the hidden dimension,
+        # multiple devices own parts of the same set of experts.
+        # Replicate the token counts so devices that share experts
+        # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/mpu.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
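+
+    # Illustrative example: with world_size=8 and moe_num_experts=4,
+    # expert_sharding_degree=4 and hsd=2, so each expert's ffn_hidden_size
+    # features are split across 2 ranks.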
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/router.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
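+
+# Illustrative example: for expert_indices with 5 entries and num_experts=2,
+# _uniform_expert_assignment returns [0, 1, 0, 1, 0] reshaped to
+# expert_indices.shape, ignoring the router's actual choices.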
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch29-cxx11-cu126-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_megablocks_cuda_6e04dec.abi3.so b/build/torch29-cxx11-cu126-x86_64-linux/_megablocks_cuda_6e04dec.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..49c7e72e12b54f9cafe86f5fd108efd17175d314
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_megablocks_cuda_6e04dec.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fae42809a452f57bb4ef6967a397029f4e557ad73424c1b68fb613070dcd3f0d
+size 15046832
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_ops.py b/build/torch29-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2f202b8db3c3f3028303ab4308cf35f950e2c74
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_6e04dec::{op_name}"
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_version.py b/build/torch29-cxx11-cu126-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/backend/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/backend/kernels.py b/build/torch29-cxx11-cu126-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have
+# CUDA. This approach preserves the original code while enabling testing
+# without a GPU.
+if torch.cuda.is_available() is False:
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
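+
+# Illustrative example for padded_gather (top_k=1): with
+# tokens_per_expert=[2, 1], bins=[2, 3] and rows padded up to a multiple of
+# 4, padded_bins=[4, 8], so the gathered output has 8 rows; rows 2-3 and
+# 5-7 are zero padding.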
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# x: (num_experts, expert_capacity, hidden_size), real.
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens * top_k), real.
+# indices: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/benchmark_util.py b/build/torch29-cxx11-cu126-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
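+
+
+# Minimal usage sketch (illustrative; assumes CUDA tensors `a` and `b`):
+#   mean_ms, std_ms = benchmark_function(lambda: torch.mm(a, b))
+#   log_benchmark('MatMul', {'m': a.shape[0], 'n': b.shape[1]}, mean_ms, std_ms)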
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/cpu_fused_moe.py b/build/torch29-cxx11-cu126-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
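+
+
+# Usage sketch (illustrative): for an interleaved gate_up tensor of shape
+# [tokens, 2 * inter_size], the activation is applied as
+# swigluoai_activation(gate_up[..., ::2], gate_up[..., 1::2]).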
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
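+
+
+# Shape sketch (illustrative): for x of shape [tokens, hidden],
+# router_weight of shape [num_experts, hidden] and moe_top_k=4, logits is
+# [tokens, num_experts] and expert_weights / expert_indices are both
+# [tokens, 4].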
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+ This implementation processes all experts in parallel using batched operations
+ instead of sequential for loops, which is more efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # Process experts one at a time: for each expert, find the
+    # (token_idx, topk_pos) pairs routed to it and compute those tokens in a
+    # single batched matmul.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
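+
+
+# Minimal usage sketch (illustrative shapes; non-interleaved weights):
+#   _, weights, ids = route_tokens_cpu(x, router_w, None, top_k, num_experts)
+#   y = cpu_fused_moe(x.view(-1, x.shape[-1]), w1, w2, weights, ids,
+#                     activation="silu", is_interleaved=False)
+# where w1 is [num_experts, hidden, 2 * inter] and w2 is
+# [num_experts, inter, hidden].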
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py b/build/torch29-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
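+
+
+# Minimal usage sketch (illustrative; bf16 weights, no quantization):
+#   _, weights, ids = route_tokens_cpu(x, router_w, None, top_k, num_experts)
+#   y = fused_moe_cpp(x, w1, w2, weights, ids,
+#                     w1_bias=b1, w2_bias=b2, alpha=1.702, limit=7.0,
+#                     is_vnni=False)
+# where w1 is [E, 2N, K] and w2 is [E, K, N] as documented above.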
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
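+# Illustrative wiring sketch (names and shapes are assumptions; in practice the
+# module is populated by the transformers kernels integration via the
+# @use_kernel_forward_from_hub decorator rather than by hand):
+#
+#   mlp = CPUMegaBlocksMoeMLP()
+#   mlp.router = torch.nn.Linear(hidden_size, num_experts)
+#   mlp.experts = experts_module  # exposes gate_up_proj, down_proj, num_experts
+#   out, expert_weights = mlp(torch.randn(batch, seq_len, hidden_size))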
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP"]
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/grouped_gemm/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/grouped_gemm/backend.py b/build/torch29-cxx11-cu126-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/grouped_gemm/ops.py b/build/torch29-cxx11-cu126-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
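+
+
+# Illustrative shapes (an assumption-based sketch, not part of the public API):
+# for `a` of shape [sum(batch_sizes), k] and `b` of shape [num_groups, k, n]
+# with trans_b=False, gmm returns a tensor of shape [sum(batch_sizes), n],
+# multiplying each row-group of `a` by its corresponding matrix in `b`.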
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/grouped_gemm_util.py b/build/torch29-cxx11-cu126-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # import grouped_gemm
+ pass
+ _grouped_gemm_is_available = True
+except ImportError as error:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+    msg = (
+        'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+    )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/layers.py b/build/torch29-cxx11-cu126-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Compute the expert sharding degree from the world size and number of experts
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
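+
+
+# Worked example (hypothetical numbers): with world_size=8, moe_num_experts=4
+# and ffn_hidden_size=1024, expert_sharding_degree is min(8, 4) = 4, so
+# hidden_sharding_degree is 8 // 4 = 2, experts_per_rank is 4 // 4 = 1 and
+# features_per_rank is 1024 // 2 = 512 features on each rank.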
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
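+    # Shape note (illustrative): for x of shape [sl, bs, hs] and router_weight
+    # of shape [num_experts, hs], logits come out as [sl * bs, num_experts] and
+    # expert_weights / expert_indices as [sl * bs, moe_top_k].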
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
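+    # The fused gate_up projection interleaves gate and up values along the
+    # last dim, so the even/odd slices below recover them before the clamped
+    # SwiGLU-style activation (descriptive note).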
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f"Expected {num_layers_per_pipeline_stage} tokens_per_expert "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
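+    # Worked example (hypothetical args): moe_num_experts=8, moe_loss_weight=0.01,
+    # num_layers=4, tokens=1024 and moe_top_k=2 give scale = 0.08 / 8192 ≈ 9.8e-6.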
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+    expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (histogram_num_experts,) = tokens_per_expert.size()
+    assert histogram_num_experts == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
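+    # Worked example (hypothetical values): tokens=1024, top_k=2, num_experts=8
+    # and world_size=1 give tokens_per_expert = 256, so with a capacity factor
+    # of 1.0 the function returns a capacity of 256 slots per expert.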
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
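+    # Shape note (illustrative): the combined expert output x is [sl * bs, hs];
+    # tokens_per_expert is a 1-D histogram of length num_experts.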
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+    # Kick off the asynchronous exchange of per-expert token counts
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
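+    # Note (descriptive): router_scores has shape [num_experts, num_tokens]
+    # because scatter_ writes each token's expert weights into a logits-shaped
+    # buffer and the transpose flips the token/expert axes.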
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
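+        # Illustrative usage (a sketch; tensor shapes are assumptions):
+        #   mlp.set_shared_expert_weights(
+        #       up_proj_weight=torch.empty(shared_hidden_size, hidden_size),
+        #       down_proj_weight=torch.empty(hidden_size, shared_hidden_size),
+        #   )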
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/megablocks/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/metadata.json b/build/torch29-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..155112c59509d3b4d07f4d090cbf57071e3f5217
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2B elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/binned_gather.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/binned_scatter.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/cumsum.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+    raise ModuleNotFoundError("Could not import the megablocks '_ops' extension module.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/gather.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
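+
+# Example (illustrative sketch): permute tokens into expert order without
+# padding. `indices` and `bin_ids` come from sorting the router's expert
+# assignments and `bins` is the inclusive cumsum of tokens_per_expert:
+#
+#   x_expert_order = gather(x, indices, bin_ids, bins, top_k)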
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/histogram.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# The import is wrapped in a try-block so that a clearer error is raised
+# when the compiled C++ operations have not been built.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
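+
+# Example (illustrative sketch): count how many tokens were routed to each
+# expert. `top_expert` holds one expert id per (token, top_k) assignment:
+#
+#   tokens_per_expert = histogram(top_expert, num_experts)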
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57b7bf8228e01237236748147368b09ffdf8072
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class HistogramBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testTorchHistogram(self, n, dtype, max_val):
+        x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccc5dcec5e9a663794fad944c45285869c4d1c1
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+from .. import stk
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+class MatmulBenchmark(parameterized.TestCase):
+
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+ blocking = 128
+ padded_tokens, _ = x.size()
+ assert padded_tokens % blocking == 0
+ assert fhs % blocking == 0
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // blocking
+ blocks_per_row = fhs // blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ blocking,
+ block_rows,
+ blocks_per_row,
+ )
+ data = torch.empty(
+ column_indices.numel(),
+ blocking,
+ blocking,
+ dtype=torch.float16,
+ device=x.device,
+ )
+ shape = (padded_tokens, fhs * ne)
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+
+ def build_input_matrix(self, sl, hs, ne):
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Assign tokens to experts uniformly.
+ top_expert = torch.arange(0, sl).cuda().int() % ne
+
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+ return out, padded_bins
+
+ def build_weight_matrix(self, ne, hs, fhs):
+ return torch.randn((hs, ne * fhs)).cuda().half()
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(x, w, topo)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(topo, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradX::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ topo = topo.t()
+
+ def benchmark():
+ return stk.ops.dsd(topo, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(out, w, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ x = x.t()
+
+ def benchmark():
+ return stk.ops.dsd(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+
+ w = w.transpose(1, 2).contiguous()
+ w = w.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd:DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = w.transpose(1, 2).contiguous()
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradX:DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ out = out.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(out, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradW:DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = torch.transpose(w, 1, 2)
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ x = torch.transpose(x, 1, 2)
+
+ def benchmark():
+ return torch.bmm(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/padded_gather.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
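+
+# Example (illustrative sketch): gather tokens into expert order with each
+# expert's slot count rounded up to the 128-wide blocking, mirroring the
+# setup used in the benchmark files:
+#
+#   bin_ids, indices = ops.sort(top_expert)
+#   tokens_per_expert = ops.histogram(top_expert, num_experts)
+#   padded_bins = ops.inclusive_cumsum(ops.round_up(tokens_per_expert, 128), 0)
+#   bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#   x = padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)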
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/padded_scatter.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
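+
+# Example (illustrative sketch): the reverse of padded_gather. Combines the
+# per-expert outputs back into token order, optionally weighting each of the
+# top_k routed copies (see padded_scatter_benchmark.py for a full setup):
+#
+#   y = padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k)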
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c575cfe7487d346ba9ec18bbb7ef17f2eb77ec51
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+class PaddedScatterTest(parameterized.TestCase):
+
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+ def testPaddedScatter(self, sl, hs, ne, top_k):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ # Sample weights for the scatter reduce.
+ weights = torch.rand((sl * top_k,)).cuda().half()
+
+ # Gather the data to prepare for backwards.
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ def benchmark():
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+ benchmark_util.log_benchmark(
+ 'Padded Scatter',
+ {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ 'top_k': top_k,
+ },
+ time,
+ std,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6536eeeae402659a087e5c51ef9840627af56501
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+class PermuteBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedGather(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+ return ops.binned_gather(x, indices, bins, ec)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedScatter(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ x = ops.binned_gather(x, indices, bins, ec)
+
+ def benchmark():
+            return ops.binned_scatter(x, indices, None, bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedGather(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedScatter(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ def benchmark():
+            return ops.padded_scatter(x, indices, bin_ids, None, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testCopy(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ # ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ y = x.clone()
+
+ def benchmark():
+ return y.copy_(x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/repeat.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
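+
+# Example (illustrative sketch): an all-ones tiling is a no-op, anything else
+# defers to torch.Tensor.repeat:
+#
+#   repeat(torch.ones(2, 3), torch.Size((1, 1)))  # returned as-is
+#   repeat(torch.ones(2, 3), torch.Size((2, 1)))  # shape (4, 3)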
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/replicate.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# The import is wrapped in a try-block so that a clearer error is raised
+# when the compiled C++ operations have not been built.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
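+
+# Example (illustrative sketch): broadcast one value per bin across all of the
+# output positions covered by that bin, e.g. expanding per-expert quantities
+# to per-token values. `bins` is the inclusive cumsum of the bin sizes and
+# `num_outputs` is the total number of output positions:
+#
+#   per_token = replicate(per_expert, bins, num_outputs)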
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/round_up.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
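+
+# Example (illustrative sketch): round each count up to a multiple of `value`,
+# e.g. padding per-expert token counts to the 128-wide block size used by the
+# block-sparse kernels:
+#
+#   round_up(torch.tensor([3, 130], dtype=torch.int32), 128)  # -> [128, 256]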
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/scatter.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
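+
+# Example (illustrative sketch): the inverse of gather. Returns tokens to
+# their original order and, when `weights` is given, scales each of the top_k
+# routed copies before they are combined per token:
+#
+#   y = scatter(x, indices, bin_ids, weights, bins, top_k)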
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/sort.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# The import is wrapped in a try-block so that a clearer error is raised
+# when the compiled C++ operations have not been built.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
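+
+# Example (illustrative sketch): radix-sort the router's expert assignments,
+# returning the sorted ids and the permutation that produced them. `end_bit`
+# only needs to cover the largest expert id:
+#
+#   bin_ids, indices = sort(top_expert)     # sort on the full dtype width
+#   bin_ids, indices = sort(top_expert, 7)  # enough for expert ids < 128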
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff957d4c552c6e61d9279a7989795472af7b7
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class SortBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_SORT_TESTS)
+ def testSort(self, n, dtype, max_val):
+ if max_val is None:
+ max_val = np.iinfo(numpy_dtype(dtype)).max
+ end_bit = int(np.ceil(np.log2(max_val)))
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_BASELINE_SORT_TESTS)
+ def testTorchSort(self, n):
+ x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+ arguments = {
+ 'n': n,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/stk_autocast.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/sum.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
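+
+# Example (illustrative sketch): behaves like torch.sum along `dim`, but skips
+# the reduction (a squeeze suffices) when that dimension has length one, e.g.
+# for top_k == 1 routing weights:
+#
+#   sum(torch.randn(4, 1, 8), dim=1)  # shape (4, 8), no reduction needed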
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/ops/topology.py b/build/torch29-cxx11-cu126-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# The import is wrapped in a try-block so that a clearer error is raised
+# when the compiled C++ operations have not been built.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
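+
+# Example (illustrative sketch): build the column indices of the block-sparse
+# topology for the expert computation. `padded_bins` is the inclusive cumsum
+# of the padded tokens-per-expert, 128 is the block size, and the output holds
+# one int16 column index per nonzero block (see matmul_benchmark.py):
+#
+#   column_indices = topology(padded_bins, 128, block_rows, blocks_per_row)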
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/backend/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/backend/autocast.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/backend/sputnik.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/backend/triton_kernels.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ #Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/matrix.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
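+
+# Worked example (illustrative only): for a 2x2 grid of blocks with nonzeros at
+# block coordinates (0, 0), (0, 1) and (1, 1), the BCSR metadata is
+#   row_indices    = [0, 0, 1]
+#   column_indices = [0, 1, 1]
+#   offsets        = [0, 2, 3]
+# Sorting by column gives gather_indices = [0, 1, 2], so _transpose returns
+#   column_indices_t = [0, 0, 1]  (row indices reordered by column)
+#   offsets_t        = [0, 1, 3]  (exclusive cumsum of the per-column histogram [1, 2])
+#   block_offsets_t  = [0, 1, 2]  (location of each transposed block in `data`)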
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
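+
+# Usage sketch (illustrative; assumes a CUDA device and that `stk` is importable):
+#   a = stk.random.randn((4096, 4096), sparsity=0.8, blocking=128).cuda()
+#   b = stk.Matrix(a.size(), torch.ones_like(a.data), a.row_indices,
+#                  a.column_indices, a.offsets).cuda()
+#   c = mul(a, b)  # entries equal a.data * b.data, reusing a's topology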
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bfd4f6af77042d3c5bdb1fe18d00e457478d46
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+@parameterized.parameters(_ELTWISE_OP_TESTS)
+class EltwiseOpsTest(parameterized.TestCase):
+
+ def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+
+ a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+ b_dense, b = _dense_and_sparse_like(a)
+
+ out = stk.ops.mul(a, b)
+ expected_out = torch.mul(a_dense, b_dense)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size(), out.size())
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = a_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = b_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/linear_ops.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
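+
+# Usage sketch (illustrative; the tensors below are assumptions, not part of this file):
+#   x    = torch.randn(512, 1024, device="cuda", dtype=torch.float16)
+#   w    = torch.randn(1024, 1024, device="cuda", dtype=torch.float16)
+#   topo = stk.random.mask(512, 1024, sparsity=0.9, blocking=128).cuda()
+#   y = sdd(x, w, topo)    # sparse [512, 1024] output with topo's sparsity pattern
+#   z = dsd(y, w.t())      # dense [512, 1024] output from sparse x dense
+#   u = dds(x, y.t())      # dense [512, 512] output from dense x sparse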
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced1d782fbc9f9ca16b3449239f1588dc5ff5e00
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+@parameterized.parameters(*_LINEAR_OP_TESTS)
+class LinearOpsTest(parameterized.TestCase):
+
+ def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = _mask(a_dense.grad, a.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = _mask(b_dense.grad, b.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+ _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+ expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
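+
+# Worked example (illustrative): with blocking=2, a single block index (1, 2)
+# expands to the four element coordinates it covers:
+#   rows 2..3 x cols 4..5 -> [[2, 4], [2, 5], [3, 4], [3, 5]]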
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
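+
+# Round-trip sketch (illustrative): to_sparse/to_dense invert each other on
+# matrices whose zero blocks are exactly zero:
+#   x  = (torch.randn(256, 256) * stk.random.dense_mask(256, 256, 0.75, 16)).half()
+#   sx = to_sparse(x, blocking=16)
+#   assert torch.equal(to_dense(sx), x)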
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3af04c0760483e578f93303dc457415948a2a34c
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+from absl.testing import parameterized
+import stk
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class MatrixOpsTest(parameterized.TestCase):
+
+ def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ x = (torch.randn(rows, cols) * mask).type(torch.float16)
+
+ # Convert the matrix to sparse format.
+ sparse_x = stk.ops.to_sparse(x, blocking)
+
+ # Validate the matrix.
+ sparse_x.validate()
+
+ # Validate the shape.
+ self.assertEqual(sparse_x.dim(), 2)
+ self.assertEqual(sparse_x.size()[0], rows)
+ self.assertEqual(sparse_x.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(sparse_x.nnz, nnz)
+
+ # Convert back to dense format.
+ dense_x = stk.ops.to_dense(sparse_x)
+
+ # Validate the shape.
+ self.assertEqual(dense_x.dim(), 2)
+ self.assertEqual(dense_x.size()[0], rows)
+ self.assertEqual(dense_x.size()[1], cols)
+
+ # Validate the sparsity
+ self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+
+ # Validate the output.
+ self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/random/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/random/random_ops.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py b/build/torch29-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..587b44ec890c861879c6296b8f9028f5d99ab82f
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+from absl.testing import parameterized
+from stk import random
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class RandomOpsTest(parameterized.TestCase):
+
+ def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+ mask = random.dense_mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(
+ torch.count_nonzero(mask).item(),
+ nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask, 0),
+ torch.eq(mask, 1))))
+
+ def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+ mask = random.mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the matrix.
+ mask.validate()
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(mask.nnz, nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask.data, 0),
+ torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/xpu_fused_moe.py b/build/torch29-cxx11-cu126-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM path (unquantized weights, no scales).
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
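+
+# Worked example (illustrative): with num_tokens=1000 and num_experts_per_node=8,
+# the loop tries 32 (32 blocks * 8 = 256 > 32), then 64 (16 * 8 = 128 > 64), then
+# 128 (8 * 8 = 64 <= 128), so it returns 128.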
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
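+
+# Example (illustrative): reinterpreting 8 workspace bytes as two int32 values.
+#   buf = torch.tensor([1, 0, 0, 0, 2, 0, 0, 0], dtype=torch.uint8)
+#   _bytes_to_typed_tensor(buf, torch.int32)
+#   # On a little-endian host this yields tensor([1, 2], dtype=torch.int32).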
+
+
+def implement_zp(qweight):
+ # change u4 to s4 to avoid zero point in gemm kernel
+ # only support default zero point now
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: fold all of this into the C++ function; temporarily exposed here until the GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code).
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
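+
+# Usage sketch (illustrative only; assumes integer inputs supported by the
+# underlying kernels):
+#   x = torch.tensor([3, 1, 2], dtype=torch.int32, device="cuda")
+#   cumsum(x)                  # inclusive: tensor([3, 4, 6])
+#   cumsum(x, exclusive=True)  # exclusive: tensor([0, 3, 4])
+#   vals, idx = argsort(x)     # vals sorted ascending, idx the sorting permutation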
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/activation_fn.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/all_to_all.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
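+
+
+# Illustrative usage (a sketch, not executed here): assuming `dist` has been
+# initialized and `group` is an expert-parallel process group, the split sizes
+# are plain per-rank token counts:
+#
+#   out, handle = all_to_all(x, output_split_sizes, input_split_sizes, group,
+#                            async_op=True)
+#   handle.wait()  # overlap other work before waiting on the exchange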
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/arguments.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in the shared expert (purpose: to allow using a custom FC layer, e.g. te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+ shared_expert_weighted_sum: bool = False # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ from packaging import version
+ if version.parse(triton.__version__) >= version.parse('3.2.0'):
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
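+
+
+# Illustrative construction (a sketch; the sizes below are made up for the
+# example). `mlp_impl='grouped'` avoids the sparse/triton>=3.2.0 restriction
+# checked in __post_init__, but requires the grouped GEMM backend:
+#
+#   args = Arguments(
+#       hidden_size=1024,
+#       ffn_hidden_size=4096,
+#       moe_num_experts=8,
+#       moe_top_k=2,
+#       mlp_impl='grouped',
+#       fp16=False,
+#       bf16=True,
+#   )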
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/common.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/dmlp_registry.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+ (i.e. only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
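+
+
+# Illustrative lookup (a sketch): with Arguments(mlp_type='glu',
+# mlp_impl='grouped'), get(args) resolves to glu.GroupedGLU(args); an unknown
+# mlp_type or mlp_impl raises a ValueError as above.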
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/dmoe.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+ # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
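+ # Worked example for the offsets built in `topology` above (made-up sizes):
+ # with padded_tokens=256 and ffn_hidden_size=256 at blocking=128, we get
+ # block_rows=2 and blocks_per_row=2, so offsets = arange(0, 5, 2) = [0, 2, 4].
+ # Every row of the sparse matrix owns exactly blocks_per_row nonzero blocks;
+ # only the column indices depend on the token-to-expert mapping.
+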
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+ # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/gelu.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
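+# The constants above come from the tanh approximation of GELU,
+# gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))):
+# 0.79788456 ~= sqrt(2/pi), and 0.79788456 + 0.1070322243 * x**2 is the
+# derivative of the tanh argument, since 0.1070322243 ~= 3 * 0.044715 * sqrt(2/pi).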
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/glu.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # Activation function and GLU gating.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GLU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/memory_test.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/mlp.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
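+# Worked sharding example (a sketch with made-up sizes, assuming
+# expert_sharding_degree=4 and hidden_sharding_degree=2, i.e. world_size=8):
+# with 8 experts and ffn_hidden_size=4096, rank 5 gets
+# expert_rank = 5 % 4 = 1 (experts [2, 4)) and row_rank = 5 // 4 = 1
+# (rows [2048, 4096)), so each rank holds a disjoint slice of the master weights.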
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+ # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+ # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+ # weighted by number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
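+ # Worked example of the weighted sum above (a sketch): with moe_top_k=4,
+ # t_experts=5, so the shared expert contributes 1/5 of the output and the
+ # routed experts contribute 4/5, keeping the overall scale comparable to an
+ # unweighted mixture of top_k + 1 experts.
+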
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/moe.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
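+ # Worked example (a sketch with made-up sizes): with top_k=2, 4096 local
+ # tokens, an expert-parallel world size of 8 and 64 experts,
+ # tokens_per_expert = 2 * 4096 * 8 / 64 = 1024, so with
+ # moe_capacity_factor=1 the capacity is 1024 tokens per expert.
+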
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+ # expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension,
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/mpu.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
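+
+
+# Hedged worked example (illustrative only, assuming ffn_hidden_size is even):
+# with an expert-parallel world size of 8 and moe_num_experts=4,
+# expert_sharding_degree() == 4 and hidden_sharding_degree() == 2, so
+# experts_per_rank() == 1 while each expert's ffn_hidden_size features are
+# split across 2 ranks (features_per_rank() == ffn_hidden_size // 2).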
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/router.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence, we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_layers/sharedexpert_registry.py b/build/torch29-cxx11-cu128-aarch64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_megablocks_cuda_6e04dec.abi3.so b/build/torch29-cxx11-cu128-aarch64-linux/_megablocks_cuda_6e04dec.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..5011e7363f8536c27906751647ac7eee905efc70
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_megablocks_cuda_6e04dec.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:81684a3eed6a7fb374cdbba3cf65f1cd46f5392ddc6d4992d37186c3b15f5734
+size 21085456
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_ops.py b/build/torch29-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2f202b8db3c3f3028303ab4308cf35f950e2c74
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_6e04dec::{op_name}"
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_version.py b/build/torch29-cxx11-cu128-aarch64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/backend/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/backend/kernels.py b/build/torch29-cxx11-cu128-aarch64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have CUDA;
+# this preserves the original code while enabling testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has a greater or equal
+ # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
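+# Hedged shape note: for x of shape [tokens, hidden] and top_k=2, gather()
+# above returns a [tokens * 2, hidden] tensor whose rows are ordered by expert
+# (via bin_ids); scatter() inverts that permutation, scales each row by its
+# expert weight when weights is given, and sums the top-k copies back into a
+# [tokens, hidden] output.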
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has a greater or equal
+ # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
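+# Hedged shape note: binned_gather() above produces a dense
+# [num_experts, expert_capacity, hidden] tensor (zero-padded for experts with
+# fewer than expert_capacity tokens; overflow tokens beyond the capacity are
+# dropped), and binned_scatter() maps it back to [tokens, hidden], weighting
+# and summing the top-k copies per token.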
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/benchmark_util.py b/build/torch29-cxx11-cu128-aarch64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
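+
+
+# Hedged usage sketch (names are illustrative):
+#
+#   mean_ms, std_ms = benchmark_function(lambda: model(x))
+#   log_benchmark('MoE forward', {'batch_size': 8}, mean_ms, std_ms)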
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/cpu_fused_moe.py b/build/torch29-cxx11-cu128-aarch64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
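+# Hedged sanity check (illustrative, not executed on import): with the default
+# alpha/limit, a saturated gate and a zero up-projection give an output near
+# the clamp limit, e.g.
+#
+#   g = torch.full((2, 4), 10.0)   # clamped to 7.0
+#   u = torch.zeros(2, 4)          # clamp range is [-7, 7]
+#   swigluoai_activation(g, u)     # ~ (0 + 1) * 7 * sigmoid(7 * 1.702) ≈ 7.0
+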
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
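+# Hedged usage sketch (shapes only; values are illustrative):
+#
+#   x = torch.randn(8, 16)   # 8 tokens, hidden=16
+#   w = torch.randn(4, 16)   # 4 experts
+#   logits, wts, idx = route_tokens_cpu(x, w, None, moe_top_k=2, moe_num_experts=4)
+#   # logits: [8, 4], wts: [8, 2] (softmax over the selected experts), idx: [8, 2]
+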
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+ This implementation processes all experts in parallel using batched operations
+ instead of sequential for loops, which is more efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+ # Process experts one at a time: for each expert, find the
+ # (token_idx, topk_pos) pairs of tokens routed to it and run its MLP on that slice.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
+
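+# Hedged shape example (CPU only; values are illustrative):
+#
+#   hs = torch.randn(8, 16)                     # 8 tokens, hidden_size=16
+#   w1 = torch.randn(4, 16, 64)                 # 4 experts, 2 * inter_size = 64
+#   w2 = torch.randn(4, 32, 16)                 # inter_size = 32
+#   _, wts, ids = route_tokens_cpu(hs, torch.randn(4, 16), None, 2, 4)
+#   out = cpu_fused_moe(hs, w1, w2, wts, ids)   # -> [8, 16]
+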
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/cpu_moe_cpp.py b/build/torch29-cxx11-cu128-aarch64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
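+# Hedged note: unlike cpu_fused_moe above, which expects w1 as
+# [num_experts, hidden, 2 * inter] and w2 as [num_experts, inter, hidden],
+# this entry point follows the sglang layout (w1: [E, 2N, K], w2: [E, K, N]);
+# CPUMegaBlocksMoeMLP below transposes and packs the expert weights accordingly
+# before calling it.
+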
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP"]
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/grouped_gemm/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/grouped_gemm/backend.py b/build/torch29-cxx11-cu128-aarch64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/grouped_gemm/ops.py b/build/torch29-cxx11-cu128-aarch64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
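+
+
+# Hedged shape sketch: for a of shape [sum(batch_sizes), K] and b of shape
+# [num_groups, K, N] (trans_b=False), gmm(a, b, batch_sizes) multiplies each
+# contiguous row-group of `a` by its own matrix in `b` and returns a
+# [sum(batch_sizes), N] tensor; batch_sizes is expected to be a 1-D integer
+# tensor (typically int64 on the CPU).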
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/grouped_gemm_util.py b/build/torch29-cxx11-cu128-aarch64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # import grouped_gemm  # vendored into this package, so no external import is needed
+ _grouped_gemm_is_available = True
+except ImportError:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+ '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/layers.py b/build/torch29-cxx11-cu128-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Calculate the expert sharding degree based on world size and number of experts
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
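+# Illustrative sharding example (hypothetical sizes): with world_size=8,
+# moe_num_experts=4 and ffn_hidden_size=3072,
+#   expert_sharding_degree(8, 4) == 4
+#   hidden_sharding_degree(8, 4, 3072) == 2
+#   experts_per_rank(4, 8) == 1
+#   features_per_rank(3072, 8, 4) == 1536
+# i.e. each rank holds one expert and half of that expert's ffn features.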
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
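+
+# Shape sketch for route_tokens (illustrative): for x of shape [sl, bs, hs]
+# the returned logits are [sl * bs, moe_num_experts], and expert_weights and
+# expert_indices are both [sl * bs, moe_top_k].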
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
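+
+# Shape sketch for mlp_forward (illustrative, assuming the batched expert
+# layout produced by binned_gather later in this file): x is
+# [num_experts, capacity, hs], w1 is [num_experts, hs, 2 * ffn] with gate/up
+# columns interleaved, w2 is [num_experts, ffn, hs], and the result is
+# [num_experts, capacity, hs].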
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
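+
+# Illustrative weighting: with shared_expert_weighted_sum=True and
+# moe_top_k=3, the shared expert contributes 1/4 of the output and the routed
+# experts contribute 3/4; otherwise the two outputs are simply added.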
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
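+
+
+# Typical usage sketch (assumed training-loop integration, not enforced here):
+# each MoE layer appends (tokens_per_expert, expert_scores) with
+# save_load_balancing_loss(...) during its forward pass, the trainer calls
+# batched_load_balancing_loss(args) once per step to obtain the auxiliary
+# loss, and clear_load_balancing_loss() resets the buffer for the next step.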
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+ tokens, score_num_experts = expert_scores.size()
+ assert score_num_experts == num_experts
+ assert len(tokens_per_expert.size()) == 1
+ (counted_num_experts,) = tokens_per_expert.size()
+ assert counted_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
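+
+
+# Sanity check (illustrative): under perfectly uniform routing every expert
+# receives tokens * top_k / num_experts tokens and has a mean score of
+# 1 / num_experts, so the dot product equals tokens * top_k / num_experts and
+# the loss evaluates to exactly 1.0.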
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
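+
+
+# Worked example (hypothetical sizes): tokens=4096, top_k=4, num_experts=128
+# on a single rank with moe_capacity_factor=1.0 gives
+# tokens_per_expert = 4 * 4096 / 128 = 128, i.e. 128 slots per expert;
+# a capacity factor of 1.25 would raise this to 160.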
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Exchange token counts across ranks without blocking
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/megablocks/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: once it is added to `sys.modules`,
+ # it would also be picked up by other imports. Instead, derive a unique
+ # module name from the hex-encoded hash of the file path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/metadata.json b/build/torch29-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e3e4edf582b7ffb515d0ed32e9fc9c89f125c441
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/all_to_all_benchmark.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2B elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/binned_gather.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/binned_scatter.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/cumsum.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap the extension import in a try-block so we can raise a clearer error
+# with instructions for building the C++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/gather.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/histogram.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap the extension import in a try-block so we can raise a clearer error
+# with instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/histogram_benchmark.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57b7bf8228e01237236748147368b09ffdf8072
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class HistogramBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testTorchHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/matmul_benchmark.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccc5dcec5e9a663794fad944c45285869c4d1c1
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1), which calls
+# torch.as_strided(...). Circumvent this chain to avoid the overhead
+# it adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+class MatmulBenchmark(parameterized.TestCase):
+
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+ blocking = 128
+ padded_tokens, _ = x.size()
+ assert padded_tokens % blocking == 0
+ assert fhs % blocking == 0
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // blocking
+ blocks_per_row = fhs // blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ blocking,
+ block_rows,
+ blocks_per_row,
+ )
+ data = torch.empty(
+ column_indices.numel(),
+ blocking,
+ blocking,
+ dtype=torch.float16,
+ device=x.device,
+ )
+ shape = (padded_tokens, fhs * ne)
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+
+ def build_input_matrix(self, sl, hs, ne):
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Assign tokens to experts uniformly.
+ top_expert = torch.arange(0, sl).cuda().int() % ne
+
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+ return out, padded_bins
+
+ def build_weight_matrix(self, ne, hs, fhs):
+ return torch.randn((hs, ne * fhs)).cuda().half()
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(x, w, topo)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(topo, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradX::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ topo = topo.t()
+
+ def benchmark():
+ return stk.ops.dsd(topo, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(out, w, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ x = x.t()
+
+ def benchmark():
+ return stk.ops.dsd(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+
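+        # Emulate an "NT" layout for the dense baseline: materialize w contiguously
+        # in its transposed shape, then view it back so bmm reads transposed strides.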
+ w = w.transpose(1, 2).contiguous()
+ w = w.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+            '0::Fwd::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = w.transpose(1, 2).contiguous()
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+            '0::GradX::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ out = out.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(out, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+            '0::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = torch.transpose(w, 1, 2)
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ x = torch.transpose(x, 1, 2)
+
+ def benchmark():
+ return torch.bmm(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/padded_gather.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/padded_scatter.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/padded_scatter_benchmark.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c575cfe7487d346ba9ec18bbb7ef17f2eb77ec51
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+class PaddedScatterTest(parameterized.TestCase):
+
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+ def testPaddedScatter(self, sl, hs, ne, top_k):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
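+        # Pad each expert's token count to a multiple of 128, the block size
+        # assumed by the blocked-sparse kernels.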
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ # Sample weights for the scatter reduce.
+ weights = torch.rand((sl * top_k,)).cuda().half()
+
+ # Gather the data to prepare for backwards.
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ def benchmark():
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+ benchmark_util.log_benchmark(
+ 'Padded Scatter',
+ {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ 'top_k': top_k,
+ },
+ time,
+ std,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/permute_benchmark.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6536eeeae402659a087e5c51ef9840627af56501
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+class PermuteBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedGather(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+ return ops.binned_gather(x, indices, bins, ec)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedScatter(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ x = ops.binned_gather(x, indices, bins, ec)
+
+ def benchmark():
+ return ops.binned_scatter(x, indices, bins)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedGather(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedScatter(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ def benchmark():
+            return ops.padded_scatter(x, indices, bin_ids, None, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testCopy(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ # ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ y = x.clone()
+
+ def benchmark():
+ return y.copy_(x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/repeat.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/replicate.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/round_up.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
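+    # For example, round_up(tensor([3, 130], dtype=torch.int32), 128) -> tensor([128, 256]).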
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/scatter.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/sort.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
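+        # The kernel fills x_out with the sorted values and iota_out with the permutation that sorts x.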
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/sort_benchmark.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff957d4c552c6e61d9279a7989795472af7b7
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class SortBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_SORT_TESTS)
+ def testSort(self, n, dtype, max_val):
+ if max_val is None:
+ max_val = np.iinfo(numpy_dtype(dtype)).max
+ end_bit = int(np.ceil(np.log2(max_val)))
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_BASELINE_SORT_TESTS)
+ def testTorchSort(self, n):
+ x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+ arguments = {
+ 'n': n,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/stk_autocast.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/sum.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
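+    # A size-1 dimension needs no reduction; squeezing it gives the same result.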
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/ops/topology.py b/build/torch29-cxx11-cu128-aarch64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/backend/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/backend/autocast.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/backend/sputnik.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
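+# A dense operand is treated as "transposed" when it is a column-major view of a
+# row-major buffer, i.e. it has stride pattern (1, num_rows).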
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
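+# For Y = A @ B, dA = dY @ B^T and dB = A^T @ dY; the transpose flags below select
+# the operand order and transposition that realize these products for each layout.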
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/backend/triton_kernels.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
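+# Default tile sizes for the blocked-sparse matmuls; BLOCK_SIZE is the sparsity block edge length.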
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+    error_string = "incompatible dimensions: tensor dimension of length {} must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store the result tile to the sparse output matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
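+# Expand the BCSR row offsets into one row index per nonzero block.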
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/matrix.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+                    f"Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
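+        # Number of stored values, i.e. nnz_blocks * blocking * blocking.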
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
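+
+
+# Illustrative usage sketch (not part of the original module). It assumes a
+# CUDA device and uses `stk.ops.ones_like` to build a second Matrix that
+# shares `a`'s topology, as `mul` requires:
+#
+#   import torch, stk
+#   dense = torch.randn(256, 256, dtype=torch.float16, device="cuda")
+#   a = stk.ops.to_sparse(dense, blocking=128)
+#   b = stk.ops.ones_like(a)       # same topology as `a`
+#   c = stk.ops.mul(a, b)          # entries match torch.mul on the dense forms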
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops_test.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bfd4f6af77042d3c5bdb1fe18d00e457478d46
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+@parameterized.parameters(_ELTWISE_OP_TESTS)
+class EltwiseOpsTest(parameterized.TestCase):
+
+ def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+
+ a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+ b_dense, b = _dense_and_sparse_like(a)
+
+ out = stk.ops.mul(a, b)
+ expected_out = torch.mul(a_dense, b_dense)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size(), out.size())
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = a_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = b_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/linear_ops.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
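+
+
+# Illustrative usage sketch (not part of the original module). It assumes a
+# CUDA device; `sdd` computes a dense-by-dense product but only materializes
+# the blocks selected by the topology Matrix:
+#
+#   import torch, stk
+#   a = torch.randn(512, 256, dtype=torch.float16, device="cuda")
+#   b = torch.randn(256, 512, dtype=torch.float16, device="cuda")
+#   topo = stk.random.mask(512, 512, sparsity=0.75, blocking=128).to("cuda")
+#   out = stk.ops.sdd(a, b, topo)          # stk.Matrix with topo's pattern
+#   out_dense = stk.ops.to_dense(out)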
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/linear_ops_test.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced1d782fbc9f9ca16b3449239f1588dc5ff5e00
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+@parameterized.parameters(*_LINEAR_OP_TESTS)
+class LinearOpsTest(parameterized.TestCase):
+
+ def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = _mask(a_dense.grad, a.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = _mask(b_dense.grad, b.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+ _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+ expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops_test.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3af04c0760483e578f93303dc457415948a2a34c
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+from absl.testing import parameterized
+import stk
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class MatrixOpsTest(parameterized.TestCase):
+
+ def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ x = (torch.randn(rows, cols) * mask).type(torch.float16)
+
+ # Convert the matrix to sparse format.
+ sparse_x = stk.ops.to_sparse(x, blocking)
+
+ # Validate the matrix.
+ sparse_x.validate()
+
+ # Validate the shape.
+ self.assertEqual(sparse_x.dim(), 2)
+ self.assertEqual(sparse_x.size()[0], rows)
+ self.assertEqual(sparse_x.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(sparse_x.nnz, nnz)
+
+ # Convert back to dense format.
+ dense_x = stk.ops.to_dense(sparse_x)
+
+ # Validate the shape.
+ self.assertEqual(dense_x.dim(), 2)
+ self.assertEqual(dense_x.size()[0], rows)
+ self.assertEqual(dense_x.size()[1], cols)
+
+ # Validate the sparsity
+ self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+
+ # Validate the output.
+ self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/random/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/random/random_ops.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/stk/random/random_ops_test.py b/build/torch29-cxx11-cu128-aarch64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..587b44ec890c861879c6296b8f9028f5d99ab82f
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+from absl.testing import parameterized
+from .. import random
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class RandomOpsTest(parameterized.TestCase):
+
+ def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+ mask = random.dense_mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(
+ torch.count_nonzero(mask).item(),
+ nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask, 0),
+ torch.eq(mask, 1))))
+
+ def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+ mask = random.mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the matrix.
+ mask.validate()
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(mask.nnz, nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask.data, 0),
+ torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/xpu_fused_moe.py b/build/torch29-cxx11-cu128-aarch64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM wrapper: expert token offsets are computed on the host.
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
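+
+# Example (illustrative): reinterpreting 8 raw bytes as two int32 values.
+#   buf = torch.arange(8, dtype=torch.uint8)
+#   vals = _bytes_to_typed_tensor(buf, torch.int32)   # shape (2,)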
+
+
+def implement_zp(qweight):
+ # Convert u4 to s4 to avoid handling a zero point in the gemm kernel.
+ # Only the default zero point is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+ # TODO: this will all be integrated into the C++ func. Temporarily exposed here before gemm fusion.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
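+
+
+# Illustrative call sketch (not part of the original module). The names, sizes
+# and the "xpu" device below are assumptions for a tiny bf16 configuration;
+# real callers pass expert weights owned by an experts module (see
+# MegaBlocksMoeMLP.forward below):
+#
+#   E, H, I, T, K = 8, 256, 512, 16, 2
+#   x = torch.randn(T, H, dtype=torch.bfloat16, device="xpu")
+#   w13 = torch.randn(E, 2 * I, H, dtype=torch.bfloat16, device="xpu")
+#   w2 = torch.randn(E, H, I, dtype=torch.bfloat16, device="xpu")
+#   weights, ids = torch.topk(torch.rand(T, E, device="xpu"), K, dim=-1)
+#   out = xpu_fused_moe(x, w13, None, None, w2, None, None,
+#                       weights.float(), ids, K, "silu", E)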
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code).
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
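+
+
+# Illustrative usage sketch (not part of the original exports). It assumes a
+# CUDA device and int32 inputs, e.g. router expert assignments:
+#
+#   x = torch.randint(0, 8, (1024,), dtype=torch.int32, device="cuda")
+#   counts = histogram(x, num_bins=8)               # tokens per expert
+#   starts = cumsum(counts, dim=0, exclusive=True)  # start offset per expert
+#   sorted_x, perm = argsort(x, end_bit=3)          # radix sort + indices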
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/activation_fn.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/all_to_all.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/arguments.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in shared expert (purpose: to allow using custom FC layer eg te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+ shared_expert_weighted_sum: bool = False # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/common.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/dmlp_registry.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+ (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
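+
+
+# Illustrative usage sketch (not part of the original module). It assumes a
+# CUDA device (for the default Arguments device factory) and, for the
+# 'grouped' backend, that the grouped_gemm dependency is available:
+#
+#   from .arguments import Arguments
+#   args = Arguments(hidden_size=1024, ffn_hidden_size=4096,
+#                    mlp_type='glu', mlp_impl='grouped')
+#   expert_mlp = get(args)   # -> glu.GroupedGLU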
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/dmoe.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
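+ # Illustrative example: with 4096 FFN features per rank, 8 experts and blocking=128, max_column_index = 256 and the sort needs ceil(log2(256)) = 8 bits.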
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses indices with the same bit width as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+ # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+ # the matrix multiplications. Calculate the starting
+ # position of each bin.
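+ # Illustrative example: tokens_per_expert=[3, 5] with blocking=128 pads to [128, 128], giving padded_bins=[128, 256].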
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/gelu.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
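+ # Multiplies g in place by d/dx of the tanh-approximate GELU (0.79788456 ~= sqrt(2/pi), 0.1070322243 ~= 3 * 0.044715 * sqrt(2/pi)).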
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/glu.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/memory_test.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
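+ # The factor of 2 below is bytes per element for the bf16 parameters used in this test.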
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/mlp.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
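+# Unwrap a torch.distributed DTensor (PyTorch >= 2.0) into its local shard so downstream kernels operate on a plain tensor.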
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+ # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+ # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+ # weighted by the number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/moe.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+ f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
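+ # (This matches the Switch-Transformer-style auxiliary load-balancing loss, summed over the layers owned by this pipeline stage.)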
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
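+ # e.g. 64 experts need ceil(log2(64)) = 6 bits for the radix sort (illustrative).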
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
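+ # Illustrative example: top_k=4, tokens=2048, world_size=1 and 64 experts give 128 tokens per expert; with moe_capacity_factor=1.25 the capacity is 160.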
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+ # expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/mpu.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
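+ # Illustrative example: world_size=8 with 4 experts gives esd=4 and hsd=2, so each expert's FFN is split across 2 ranks, each holding half of its ffn_hidden_size rows.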
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/router.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
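+
+
+# Usage sketch (illustrative; assumes a fully populated Arguments instance `args`
+# with hidden_size, moe_num_experts, moe_top_k, etc. set):
+#   router = LearnedRouter(args)
+#   scores, expert_weights, expert_indices = router(x)   # x: [batch, seq, hidden]
+#   scores:                          [batch * seq, moe_num_experts], softmax over experts
+#   expert_weights / expert_indices: [batch * seq, moe_top_k]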
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch29-cxx11-cu128-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+    """Returns a SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_megablocks_cuda_6e04dec.abi3.so b/build/torch29-cxx11-cu128-x86_64-linux/_megablocks_cuda_6e04dec.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..8bcd44dbaa99ef7a3a231720c1e2365db938586e
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_megablocks_cuda_6e04dec.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0349d7de015576f9dae76f82c321d491609d1ae84bc5f2cb8053891e167a0aca
+size 20995704
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_ops.py b/build/torch29-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2f202b8db3c3f3028303ab4308cf35f950e2c74
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_6e04dec::{op_name}"
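+
+
+# For example, add_op_namespace_prefix("gmm") returns
+# "_megablocks_cuda_6e04dec::gmm", the fully qualified name expected by
+# torch.library utilities when referring to the registered op.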
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_version.py b/build/torch29-cxx11-cu128-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/backend/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/backend/kernels.py b/build/torch29-cxx11-cu128-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub triton autotune when testing in an environment that does not have CUDA;
+# this approach preserves the original code but enables testing without a GPU
+if torch.cuda.is_available() is False:
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+    # NOTE: There is no padding, so the number of output rows equals the
+    # number of input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
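+# Round-trip sketch for gather/scatter above (illustrative, top_k=1, no weights,
+# tensors on the GPU):
+#   x: [4, H]; per-token expert assignment: [1, 0, 1, 0]
+#   indices = argsort(assignment)              -> [1, 3, 0, 2]
+#   bin_ids = sorted assignment                -> [0, 0, 1, 1]
+#   bins    = cumulative tokens per expert     -> [2, 4]
+#   y = gather(x, indices, bin_ids, None, bins, top_k=1)    # rows grouped by expert
+#   scatter(y, indices, bin_ids, None, bins, top_k=1) == x  # inverse permutation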
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per (token, top_k) entry. Array 'x' uses the padded
+    # layout, so it can have more rows than there are entries.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/benchmark_util.py b/build/torch29-cxx11-cu128-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
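+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustrative only). benchmark_function relies on
+    # torch.cuda.Event timers, so a CUDA device is required.
+    if torch.cuda.is_available():
+        a = torch.randn(1024, 1024, device="cuda")
+        b = torch.randn(1024, 1024, device="cuda")
+        mean_ms, std_ms = benchmark_function(lambda: a @ b, iterations=20, warmup=5)
+        log_benchmark("MatMul", {"m": 1024, "n": 1024, "k": 1024}, mean_ms, std_ms)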
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/cpu_fused_moe.py b/build/torch29-cxx11-cu128-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
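+# Worked example for swigluoai_activation (illustrative, defaults alpha=1.702, limit=7.0):
+#   gate = 2.0, up = 1.0
+#   glu  = 2.0 * sigmoid(2.0 * 1.702) = 2.0 * sigmoid(3.404) ~= 1.936
+#   out  = (1.0 + 1.0) * glu ~= 3.87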
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
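+# Example for route_tokens_cpu (illustrative, CPU-only):
+#   x = torch.randn(4, 16)                 # 4 tokens, hidden size 16
+#   w = torch.randn(8, 16)                 # router weight for 8 experts
+#   logits, weights, idx = route_tokens_cpu(x, w, None, moe_top_k=2, moe_num_experts=8)
+#   logits: [4, 8]; weights and idx: [4, 2]; weights sum to 1 along the last dim.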
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+    This implementation loops over the experts, but processes all of the tokens
+    assigned to each expert with batched tensor operations, which keeps it
+    reasonably efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # For each expert, find the (token_idx, topk_pos) pairs routed to it and
+    # process them together as one batch.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
+
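+# Minimal sketch for cpu_fused_moe (illustrative, random weights, default silu activation):
+#   T, H, I, E, K = 4, 16, 32, 8, 2
+#   x   = torch.randn(T, H)
+#   w1  = torch.randn(E, H, 2 * I) * 0.02      # gate_up_proj
+#   w2  = torch.randn(E, I, H) * 0.02          # down_proj
+#   wts = torch.full((T, K), 1.0 / K)
+#   ids = torch.randint(0, E, (T, K))
+#   out = cpu_fused_moe(x, w1, w2, wts, ids)   # -> [T, H]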
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py b/build/torch29-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
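+# Call sketch for fused_moe_cpp (illustrative; `x`, `w1`, `w2`, `wts`, `ids` are
+# hypothetical tensors with the shapes documented above, and the compiled CPU
+# kernel from ._ops must be available):
+#   out = fused_moe_cpp(x, w1, w2, wts, ids.to(torch.int32),
+#                       w1_bias=None, w2_bias=None,
+#                       alpha=1.702, limit=7.0)   # alpha/limit enable swigluoai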
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "CPUMegaBlocksMoeMLP"]
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/grouped_gemm/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/grouped_gemm/backend.py b/build/torch29-cxx11-cu128-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/grouped_gemm/ops.py b/build/torch29-cxx11-cu128-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
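+
+
+# Usage sketch (illustrative; requires the CUDA grouped-GEMM backend):
+#   a: [sum(batch_sizes), k], b: [num_groups, k, n], batch_sizes: 1-D integer tensor
+#   c = gmm(a, b, batch_sizes)   # -> [sum(batch_sizes), n], differentiable w.r.t. a and b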
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/grouped_gemm_util.py b/build/torch29-cxx11-cu128-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # import grouped_gemm
+ pass
+ _grouped_gemm_is_available = True
+except ImportError as error:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+    msg = (
+        'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+    )
+    assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend, ops
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/layers.py b/build/torch29-cxx11-cu128-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Calculate the expert sharding degree from the world size and number of experts
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
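+# Worked example (illustrative): with world_size=8, moe_num_experts=4 and
+# ffn_hidden_size=4096, the helpers above give expert_sharding_degree -> 4,
+# hidden_sharding_degree -> 2, experts_per_rank -> 1 and features_per_rank -> 2048.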
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
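+
+# Example (hypothetical): with moe_top_k=4 and shared_expert_weighted_sum=True,
+# the shared expert contributes 1/5 of the combined output and the routed
+# experts 4/5; with shared_expert_weighted_sum=False the two outputs are summed.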
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
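+
+# Note: each MoE layer can append its (tokens_per_expert, expert_scores) pair via
+# save_load_balancing_loss during the forward pass; batched_load_balancing_loss
+# below reduces them into a single scalar, and clear_load_balancing_loss is
+# typically called once that loss has been consumed for the step.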
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup],
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
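+
+# Example (hypothetical numbers): with tokens=4096, top_k=4, num_experts=128 and
+# moe_capacity_factor=1.0 on a single device, tokens_per_expert = 4 * 4096 / 128
+# = 128, so each expert is allocated capacity for 128 tokens.
+# expert_capacity_fn below performs the same computation with default arguments
+# and is the variant called by forward_once / parallel_forward_once.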
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, score_num_experts = expert_scores.size()
+    assert score_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (count_num_experts,) = tokens_per_expert.size()
+    assert count_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
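+
+# Note: `indices` orders tokens by their assigned expert, `bin_ids` holds the
+# expert id at each sorted position, `bins` is the inclusive cumulative count
+# (the end offset of each expert's bin), and `tokens_per_expert` is the raw
+# histogram of assignments.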
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+    # Flatten the token dimension before gathering tokens to experts
+    x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
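+
+# Note: forward_once is the single-device path: tokens are binned locally,
+# pushed through mlp_forward, and scattered back in place. parallel_forward_once
+# below adds the all-to-all exchanges needed for expert model parallelism.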
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+        # Kick off the asynchronous exchange of per-expert token counts
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
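+
+# Note: moe_forward returns the combined expert output reshaped to the input
+# shape, the top-k expert weights, and dense router scores of shape
+# [num_experts, num_tokens] built by scattering the top-k weights back over the
+# full expert dimension.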
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
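+
+# Example (hypothetical sizes): create_shared_expert_weights(1152, 3072, device,
+# dtype, torch.nn.init.kaiming_uniform_) returns (up_proj_weight,
+# down_proj_weight, None, None), which can then be handed to
+# MegaBlocksMoeMLPWithSharedExpert.set_shared_expert_weights defined below.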
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/megablocks/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/metadata.json b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e3e4edf582b7ffb515d0ed32e9fc9c89f125c441
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2B elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/binned_gather.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/binned_scatter.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/cumsum.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/gather.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/histogram.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57b7bf8228e01237236748147368b09ffdf8072
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class HistogramBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testTorchHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccc5dcec5e9a663794fad944c45285869c4d1c1
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+class MatmulBenchmark(parameterized.TestCase):
+
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+ blocking = 128
+ padded_tokens, _ = x.size()
+ assert padded_tokens % blocking == 0
+ assert fhs % blocking == 0
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // blocking
+ blocks_per_row = fhs // blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ blocking,
+ block_rows,
+ blocks_per_row,
+ )
+ data = torch.empty(
+ column_indices.numel(),
+ blocking,
+ blocking,
+ dtype=torch.float16,
+ device=x.device,
+ )
+ shape = (padded_tokens, fhs * ne)
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+
+ def build_input_matrix(self, sl, hs, ne):
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Assign tokens to experts uniformly.
+ top_expert = torch.arange(0, sl).cuda().int() % ne
+
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+ return out, padded_bins
+
+ def build_weight_matrix(self, ne, hs, fhs):
+ return torch.randn((hs, ne * fhs)).cuda().half()
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(x, w, topo)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(topo, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradX::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ topo = topo.t()
+
+ def benchmark():
+ return stk.ops.dsd(topo, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(out, w, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ x = x.t()
+
+ def benchmark():
+ return stk.ops.dsd(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+
+ w = w.transpose(1, 2).contiguous()
+ w = w.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd:DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = w.transpose(1, 2).contiguous()
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradX:DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ out = out.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(out, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradW:DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = torch.transpose(w, 1, 2)
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ x = torch.transpose(x, 1, 2)
+
+ def benchmark():
+ return torch.bmm(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/padded_gather.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/padded_scatter.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c575cfe7487d346ba9ec18bbb7ef17f2eb77ec51
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+class PaddedScatterTest(parameterized.TestCase):
+
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+ def testPaddedScatter(self, sl, hs, ne, top_k):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ # Sample weights for the scatter reduce.
+ weights = torch.rand((sl * top_k,)).cuda().half()
+
+ # Gather the data to prepare for backwards.
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ def benchmark():
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+ benchmark_util.log_benchmark(
+ 'Padded Scatter',
+ {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ 'top_k': top_k,
+ },
+ time,
+ std,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6536eeeae402659a087e5c51ef9840627af56501
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+class PermuteBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedGather(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+ return ops.binned_gather(x, indices, bins, ec)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedScatter(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ x = ops.binned_gather(x, indices, bins, ec)
+
+ def benchmark():
+ return ops.binned_scatter(x, indices, bins)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedGather(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)  # top_k == 1 here
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedScatter(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+        # Sample weights for the scatter reduce (top_k == 1 here).
+        weights = torch.rand((sl,)).cuda().half()
+
+        def benchmark():
+            return ops.padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testCopy(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ # ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ y = x.clone()
+
+ def benchmark():
+ return y.copy_(x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/repeat.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/replicate.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
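+        # The gradient of replicating a value across its bin is the sum of the
+        # incoming gradients over that bin; replicate_backward computes that
+        # per-bin reduction.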
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/round_up.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
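+    # For example, round_up(torch.tensor([3, 130], dtype=torch.int32), 128)
+    # returns tensor([128, 256], dtype=torch.int32).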
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/scatter.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/sort.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
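+        # end_bit bounds how many key bits the sort needs to consider; callers
+        # that know their values fit in fewer bits can pass a smaller value,
+        # which typically makes the sort cheaper (see sort_benchmark.py).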
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff957d4c552c6e61d9279a7989795472af7b7
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class SortBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_SORT_TESTS)
+ def testSort(self, n, dtype, max_val):
+ if max_val is None:
+ max_val = np.iinfo(numpy_dtype(dtype)).max
+ end_bit = int(np.ceil(np.log2(max_val)))
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+        mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit))
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_BASELINE_SORT_TESTS)
+ def testTorchSort(self, n):
+ x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+ arguments = {
+ 'n': n,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/stk_autocast.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/sum.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/ops/topology.py b/build/torch29-cxx11-cu128-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/backend/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/backend/autocast.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/backend/sputnik.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
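+    # A contiguous 2D tensor viewed through .t() has strides (1, rows); treat
+    # that layout as "transposed" so the underlying contiguous buffer can be
+    # handed to the kernels directly.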
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/backend/triton_kernels.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
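+    # Each program instance computes one nonzero output block; its block-row
+    # and block-column come from the sparse topology's index arrays.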
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ #Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
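+    # One program per block row: expand the CSR-style offsets into an explicit
+    # row id for every nonzero block in that row.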
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/matrix.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+    block_rows = np.prod(shape[:-1]) // block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
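+    # block_offsets_t[i] is the index into `data` of the i-th nonzero block
+    # when blocks are visited in transposed (column-major) order.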
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
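+
+    `data` holds the nonzero blocks as a [nnz_blocks, blocking, blocking]
+    tensor, `row_indices`/`column_indices` give each block's block-row and
+    block-column coordinates, and `offsets` marks where each block row starts
+    (one entry per block row plus one).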
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
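+
+    Example (illustrative; a and b must share one topology):
+        c = mul(a, b)  # c reuses a's indices; c.data == a.data * b.data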
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bfd4f6af77042d3c5bdb1fe18d00e457478d46
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+@parameterized.parameters(_ELTWISE_OP_TESTS)
+class EltwiseOpsTest(parameterized.TestCase):
+
+ def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+
+ a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+ b_dense, b = _dense_and_sparse_like(a)
+
+ out = stk.ops.mul(a, b)
+ expected_out = torch.mul(a_dense, b_dense)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size(), out.size())
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = a_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = b_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/linear_ops.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced1d782fbc9f9ca16b3449239f1588dc5ff5e00
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
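+    # Low-precision (fp16/bf16) matmuls accumulate rounding error, so tolerate
+    # up to `pct` percent of entries falling outside the relative tolerance.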
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+@parameterized.parameters(*_LINEAR_OP_TESTS)
+class LinearOpsTest(parameterized.TestCase):
+
+ def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = _mask(a_dense.grad, a.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = _mask(b_dense.grad, b.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+ _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+ expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
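+
+# Illustration: with blocking=2, the block coordinate (1, 0) expands to the four element
+# coordinates (2, 0), (2, 1), (3, 0) and (3, 1), i.e. every element of that 2x2 block.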
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
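+
+# Illustration: for an 8x8 matrix with blocking=4, _mask returns a 2x2 boolean tensor that
+# is True for each 4x4 block containing at least one nonzero entry.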
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
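+
+# Illustration: for an 8x8 matrix with blocking=4 whose only nonzero block is the top-left
+# one, to_sparse returns data of shape [1, 4, 4], row_indices=[0], column_indices=[0] and
+# offsets=[0, 1, 1].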
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3af04c0760483e578f93303dc457415948a2a34c
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+from absl.testing import parameterized
+import stk
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class MatrixOpsTest(parameterized.TestCase):
+
+ def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ x = (torch.randn(rows, cols) * mask).type(torch.float16)
+
+ # Convert the matrix to sparse format.
+ sparse_x = stk.ops.to_sparse(x, blocking)
+
+ # Validate the matrix.
+ sparse_x.validate()
+
+ # Validate the shape.
+ self.assertEqual(sparse_x.dim(), 2)
+ self.assertEqual(sparse_x.size()[0], rows)
+ self.assertEqual(sparse_x.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(sparse_x.nnz, nnz)
+
+ # Convert back to dense format.
+ dense_x = stk.ops.to_dense(sparse_x)
+
+ # Validate the shape.
+ self.assertEqual(dense_x.dim(), 2)
+ self.assertEqual(dense_x.size()[0], rows)
+ self.assertEqual(dense_x.size()[1], cols)
+
+ # Validate the sparsity
+ self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+
+ # Validate the output.
+ self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/random/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/random/random_ops.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
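+
+# Illustration: dense_mask(4, 4, 0.5, blocking=2) keeps round(4 * (1 - 0.5)) = 2 of the four
+# 2x2 blocks as ones and zeroes the rest, returning a float32 0/1 mask of shape [4, 4].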
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py b/build/torch29-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..587b44ec890c861879c6296b8f9028f5d99ab82f
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+from absl.testing import parameterized
+from stk import random
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class RandomOpsTest(parameterized.TestCase):
+
+ def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+ mask = random.dense_mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(
+ torch.count_nonzero(mask).item(),
+ nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask, 0),
+ torch.eq(mask, 1))))
+
+ def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+ mask = random.mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the matrix.
+ mask.validate()
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(mask.nnz, nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask.data, 0),
+ torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/xpu_fused_moe.py b/build/torch29-cxx11-cu128-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM wrapper: builds the exclusive prefix sum of per-expert token counts
+# on the host before dispatching to the grouped GEMM op.
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
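+ # e.g. exclusive_prefix_sum([3, 1, 4]) -> [0, 3, 4, 8]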
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
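+
+# Illustration: with num_tokens=1000 and num_experts_per_node=8, candidates 32 and 64 fail
+# (ceil(1000/32)*8 = 256 > 32 and ceil(1000/64)*8 = 128 > 64), while 128 satisfies
+# ceil(1000/128)*8 = 64 <= 128, so 128 is returned.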
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
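+
+# Illustration: a 16-byte uint8 buffer reinterpreted with dtype=torch.int32 yields a tensor
+# of 4 int32 values; callers are expected to pass exactly-sized byte slices.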
+
+
+def implement_zp(qweight):
+ # Convert u4 to s4 so the GEMM kernel does not have to apply a zero point.
+ # Only the default zero point is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
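+
+# Worked example: the packed u4 byte 0x9A holds the nibbles 9 and 10, which become the s4
+# values 1 and 2 after subtracting the default zero point of 8 and are repacked as 0x12.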
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+ # TODO: this will all be integrated into the C++ op. Temporarily exposed here until GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
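+ # Each named buffer is padded to a 256-byte boundary and packed back to back into a
+ # single uint8 workspace; ws_map records (padded_size, byte_offset) for each buffer.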
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
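+ # NOTE: the softmax below normalizes only the selected top-k logits for each token,
+ # not the full expert distribution.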
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract the device_mesh captured in the experts module's (otherwise unused) forward pre-hook closure (for expert-parallel support)."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code).
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
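+# Usage sketch (illustrative; assumes a CUDA tensor, since these ops dispatch to the CUDA kernels):
+#   x = torch.tensor([1, 2, 3], device="cuda")
+#   cumsum(x)                  # -> tensor([1, 3, 6])
+#   cumsum(x, exclusive=True)  # -> tensor([0, 1, 3])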
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/activation_fn.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/all_to_all.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
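+
+# Usage sketch (illustrative sizes): with two ranks where this rank sends input_split_sizes=[2, 1]
+# rows and expects output_split_sizes=[3, 2] rows back, `out` has 5 rows. `handle` is a Work
+# object when async_op=True (wait() on it before reading `out`) and None otherwise.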
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/arguments.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in the shared expert (allows using a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert if it should differ from ffn_hidden_size
+ shared_expert_weighted_sum: bool = False # enable using a weighted sum for the shared expert output (weighted by the number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ # Compare parsed (major, minor) to avoid lexicographic string comparison (e.g. '3.10.0' < '3.2.0').
+ if tuple(int(v) for v in triton.__version__.split('.')[:2]) >= (3, 2):
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/common.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/dmlp_registry.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+ (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/dmoe.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
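+
+# e.g. promote_scalar(torch.tensor(5)) -> tensor([5]); tensors with one or more dims pass through unchanged.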
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+ # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+ # Round the token counts up to the block size used in
+ # the matrix multiplications. Calculate the starting
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
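+ # Illustration (assumed values): with blocking=128 and tokens_per_expert=[130, 0, 5, 1],
+ # the padded counts are [256, 0, 128, 128], so padded_bins=[256, 256, 384, 512] and
+ # bins=[130, 130, 135, 136].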
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/gelu.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/glu.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
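+        # gate = x @ w1.t(), value = x @ v1.t(); output = (activation_fn(gate) * value) @ w2.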
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+        # Gated activation: activation_fn(x @ w1.t()) * (x @ v1.t()).
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+    """GLU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/memory_test.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/mlp.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
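+    # Identity in the forward pass; multiplies the incoming gradient by `scale` in the backward pass.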
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+    def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
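+    # Unwrap a DTensor to its local shard; plain tensors pass through unchanged.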
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+        # Activation function (GeLU by default).
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+            # weighted by the number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/moe.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
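+        # Capacity per expert: moe_capacity_factor * (top_k * tokens * world_size / num_experts).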
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
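+        # Loss = (num_experts / (tokens * top_k)) * dot(tokens_per_expert, mean router scores).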
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
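+        # The cumsum can return a 0-d tensor; promote it to a 1-element vector.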
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+            # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/mpu.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
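+    # Ranks left over after expert sharding split each expert's ffn_hidden_size rows.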
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/router.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
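+    # Per-router z-loss: mean over tokens of logsumexp(logits)^2, scaled by moe_zloss_weight.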
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
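+        # Multiplicative jitter: scale inputs by uniform noise in [1 - eps, 1 + eps).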
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
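+        # For top-1 routing, max is cheaper than a full top-k.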
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layers/sharedexpert_registry.py b/build/torch29-cxx11-cu129-aarch64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+    """Returns a SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch29-cxx11-cu129-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..dab216ec9f834709a36442fd6d8727e6129e1a74
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d457732aa8fa3b1c8d08d6e9d48d08e6b8fc211967df7e45a82e1d88e58c9728
+size 16035488
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_ops.py b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_version.py b/build/torch29-cxx11-cu129-aarch64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/backend/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/backend/kernels.py b/build/torch29-cxx11-cu129-aarch64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have CUDA.
+# This preserves the original code while enabling testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
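+    # Each program copies (and optionally scales by 'weights') one row between the
+    # token-ordered layout 'a' and the expert-ordered, possibly padded layout 'b'.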
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
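+
+# Illustrative usage sketch (not part of the original module; the tensors are
+# hypothetical and assume routing metadata built as in layers.indices_and_bins,
+# with padded_bins rounded up to the block-sparse tile size):
+#
+#   x = torch.randn(tokens, hidden_size, device="cuda", dtype=torch.float16)
+#   out = padded_gather(x, indices, bin_ids, None, bins, padded_bins, top_k)
+#   # out has padded_bins[-1] rows; the rows for expert e occupy the slice
+#   # [padded_bins[e - 1], padded_bins[e]) and the padded tail rows stay zero.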
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
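+
+# Illustrative round trip (hypothetical sketch, not part of the original
+# module): for dropless MoE the tokens are first gathered into expert order,
+# processed, and then scattered back with the router weights applied and the
+# top-k copies summed:
+#
+#   h = gather(x, indices, bin_ids, None, bins, top_k)      # (tokens * top_k, hidden)
+#   h = expert_mlp(h)                                        # hypothetical per-expert MLP
+#   y = scatter(h, indices, bin_ids, weights, bins, top_k)   # (tokens, hidden)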
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per entry of 'wgrad' (tokens * top_k). Array 'x' has a
+    # greater or equal number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+    # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
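+
+# Illustrative capacity-based sketch (hypothetical, not part of the original
+# module): tokens are binned per expert up to a fixed expert_capacity,
+# processed as one dense (num_experts, expert_capacity, hidden) batch, then
+# scattered back to token order:
+#
+#   h = binned_gather(x, indices, None, bins, expert_capacity, top_k)
+#   h = expert_mlp(h)                                      # hypothetical batched expert MLP
+#   y = binned_scatter(h, indices, weights, bins, top_k)   # (tokens, hidden)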
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+    # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/benchmark_util.py b/build/torch29-cxx11-cu129-aarch64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+    print(f'mean time = {time:.3f}ms, std time = {std:.3f}ms')
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
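+
+# Illustrative usage (hypothetical workload, not part of the original module):
+#
+#   a = torch.randn(4096, 4096, device="cuda")
+#   b = torch.randn(4096, 4096, device="cuda")
+#   mean_ms, std_ms = benchmark_function(lambda: a @ b)
+#   log_benchmark("matmul", {"m": 4096, "n": 4096, "k": 4096}, mean_ms, std_ms)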
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/cpu_fused_moe.py b/build/torch29-cxx11-cu129-aarch64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
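+
+# Quick numeric sanity check (illustrative only): for gate = up = 1.0,
+# silu_and_mul_activation returns sigmoid(1) * 1 ~= 0.731, while
+# swigluoai_activation returns (1 + 1) * 1 * sigmoid(1.702) ~= 1.69 because of
+# the (up + 1) term and the alpha-scaled sigmoid.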
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
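+
+# Illustrative usage (hypothetical shapes and router, not part of the original
+# module):
+#
+#   x = torch.randn(2, 8, 1152)                   # [batch, seq, hidden]
+#   router = torch.nn.Linear(1152, 128)           # hypothetical router
+#   logits, weights, ids = route_tokens_cpu(x, router.weight, router.bias, 4, 128)
+#   # logits: [16, 128]; weights and ids: [16, 4]; weights are softmaxed over
+#   # the selected top-k experts only.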
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+    This implementation loops over the experts and, for each one, gathers the
+    tokens routed to it and processes them with batched tensor operations.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # Process each expert in turn: find the (token, top-k slot) pairs routed to
+    # it, run its MLP on those tokens, and accumulate the weighted outputs.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
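+
+# Illustrative end-to-end sketch (hypothetical shapes and router, not part of
+# the original module; assumes the GptOss-style interleaved gate/up layout):
+#
+#   hidden = torch.randn(16, 64)                    # [tokens, hidden]
+#   w1 = torch.randn(8, 64, 2 * 128) * 0.02         # [experts, hidden, 2*inter]
+#   w2 = torch.randn(8, 128, 64) * 0.02             # [experts, inter, hidden]
+#   router = torch.nn.Linear(64, 8)                 # hypothetical router
+#   _, weights, ids = route_tokens_cpu(hidden, router.weight, router.bias, 2, 8)
+#   out = cpu_fused_moe(hidden, w1, w2, weights, ids, activation="swigluoai")
+#   # out: [16, 64]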
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/cpu_moe_cpp.py b/build/torch29-cxx11-cu129-aarch64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+    # MXFP4/FP8 kernels only support bf16, and the C++ kernels do not support
+    # float32 inputs; convert to bf16 if needed.
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
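+
+# Illustrative usage sketch (hypothetical tensors, not part of the original
+# module). Note the sglang-style layout differs from cpu_fused_moe: w1 is
+# [E, 2N, K] and w2 is [E, K, N], and is_vnni=True is only valid after the
+# weights have been packed with ops.convert_weight_packed:
+#
+#   out = fused_moe_cpp(hidden, w1, w2, topk_weights, topk_ids.to(torch.int32),
+#                       w1_bias=b1, w2_bias=b2, alpha=1.702, limit=7.0)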
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "CPUMegaBlocksMoeMLP"]
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/grouped_gemm/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/grouped_gemm/backend.py b/build/torch29-cxx11-cu129-aarch64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/grouped_gemm/ops.py b/build/torch29-cxx11-cu129-aarch64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
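+
+# Illustrative usage (hypothetical shapes, not part of the original module):
+# with trans_b=False, 'a' is (sum(batch_sizes), k), 'b' is (num_groups, k, n)
+# and the result is (sum(batch_sizes), n); each group of rows is multiplied by
+# its own expert matrix, and the op is differentiable w.r.t. both inputs:
+#
+#   a = torch.randn(6, 16, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+#   b = torch.randn(3, 16, 32, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+#   batch_sizes = torch.tensor([1, 2, 3])
+#   out = gmm(a, b, batch_sizes)   # (6, 32)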
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/grouped_gemm_util.py b/build/torch29-cxx11-cu129-aarch64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored into this package, so there is no external
+    # `import grouped_gemm` to perform here.
+    _grouped_gemm_is_available = True
+except ImportError:
+    warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/layers.py b/build/torch29-cxx11-cu129-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+    def register_fake(op_name):
+        def decorator(fn):
+            return fn
+
+        return decorator
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+                # Meta implementation - (num_experts, expert_capacity, hidden_size)
+                if x.dim() >= 2:
+                    hidden_size = x.size(-1)
+                    return torch.empty(
+                        (bins.shape[0], bin_size, hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+                # Meta implementation - reduces to (tokens, hidden_size)
+                if x.dim() >= 3:
+                    tokens = indices.numel() // top_k if top_k > 0 else x.size(1)
+                    return torch.empty(
+                        (tokens, x.size(2)), dtype=x.dtype, device=x.device
+                    )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
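+
+# The patched wrappers above only change behaviour under torch.compile: when
+# torch.compiler.is_compiling() is true they return placeholder tensors with
+# the expected shapes instead of launching CUDA kernels, so the graph can be
+# traced; in eager mode they fall through to the original kernels, e.g.
+# (illustrative) bin_ids, indices = ops.sort(top_experts, sort_end_bit).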
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Calculate the expert sharding degree from the world size and expert count
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
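+
+# Worked example (illustrative): with world_size=8, moe_num_experts=4 and
+# ffn_hidden_size=1536, expert_sharding_degree = min(8, 4) = 4,
+# hidden_sharding_degree = 8 // 4 = 2, experts_per_rank = 4 // 4 = 1 and
+# features_per_rank = 1536 // 2 = 768, i.e. each expert is replicated on two
+# ranks that each hold half of its FFN features.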
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+    moe_jitter_eps: Optional[float] = None,
+    moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
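+
+# Shape sketch (illustrative): mlp_forward expects the binned layout produced
+# by ops.binned_gather, i.e. x is (num_experts, expert_capacity, hidden),
+# w1 is (num_experts, hidden, 2 * ffn) with interleaved gate/up columns and
+# w2 is (num_experts, ffn, hidden), so each torch.bmm call performs one
+# batched matmul per expert.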
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
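+
+# Worked example (illustrative): with tokens=4096, top_k=4, num_experts=128,
+# a single expert-parallel rank and moe_capacity_factor=1.0, each expert gets
+# int(1.0 * 4 * 4096 * 1 / 128) = 128 slots; a capacity factor of 1.25 would
+# raise that to 160.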
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, score_num_experts = expert_scores.size()
+    assert score_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (count_num_experts,) = tokens_per_expert.size()
+    assert count_num_experts == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
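+
+# Worked example (illustrative, assuming expert_scores is the per-token softmax
+# over all experts): under perfectly balanced routing each expert receives
+# tokens * top_k / num_experts tokens and has a mean score of 1 / num_experts,
+# so the dot product is tokens * top_k / num_experts and the loss reduces to
+# (num_experts / (tokens * top_k)) * (tokens * top_k / num_experts) = 1.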
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+    # Launch the count exchange asynchronously so it overlaps with the
+    # local permutation below
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
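+    # The token exchange is asynchronous; the expert-index bookkeeping below
+    # overlaps with communication, and we wait on the handle just before the
+    # expert computation.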
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
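+    # Received tokens were already replicated top_k times by the local gather
+    # and the routing weights are applied in the final scatter, so each token
+    # is processed exactly once here (top_k=1, no expert weights).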
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
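+    # router_scores: [num_experts, num_tokens], zero everywhere except each
+    # token's selected top-k experts.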
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
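+        # Number of bits the radix sort needs to cover every expert id
+        # (at least 1).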
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/megablocks/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is: once it is added to `sys.modules`,
+    # it would also be picked up by other imports. Instead, derive a unique
+    # module name from the hex-encoded hash of the file path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
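+# Load the top-level package __init__ and re-export its names so this build
+# variant exposes the same public API.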
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/metadata.json b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e3e4edf582b7ffb515d0ed32e9fc9c89f125c441
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/all_to_all_benchmark.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per fp16 element.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/binned_gather.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/binned_scatter.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/cumsum.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
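+        # 1-D inputs are promoted to a single row (cumsum along dim 1) and the
+        # result is squeezed back to 1-D.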
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/gather.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/histogram.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/histogram_benchmark.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/matmul_benchmark.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/padded_gather.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/padded_scatter.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/padded_scatter_benchmark.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/permute_benchmark.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/repeat.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/replicate.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
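+        # Replicate each value of `x` across the output according to the bin
+        # boundaries in `bins`, producing `num_outputs` columns per row.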
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/round_up.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+ # do this in a custom kernel. We only expect
+ # to use this on arrays of less than 1k elements.
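+    # Example: round_up(tensor([130], dtype=int32), 128) -> tensor([256]).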
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/scatter.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/sort.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
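+# Returns the sorted keys together with the permutation that produced them
+# (an argsort). `end_bit` bounds how many low-order bits the sort considers and
+# defaults to the full bit-width of the input dtype.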
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/sort_benchmark.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
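+    # Timing uses CUDA events; elapsed_time reports milliseconds, so all returned
+    # statistics are in ms.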
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/stk_autocast.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/sum.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
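+    # A length-1 reduction dimension can simply be squeezed away; the result is
+    # identical to a full reduction over that dimension.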
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/ops/topology.py b/build/torch29-cxx11-cu129-aarch64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
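+# Allocates one int16 entry per output block (output_block_rows *
+# output_block_columns in total) and has ops.indices fill in the index metadata
+# for the block-sparse topology implied by the padded bins.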
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/backend/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/backend/autocast.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/backend/sputnik.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
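+# Naming convention, shared with the Triton backend: the three letters spell the
+# density of (output, lhs, rhs). "dsd" computes a dense output from a sparse lhs
+# and a dense rhs, "dds" from a dense lhs and a sparse rhs, and "sdd" produces a
+# sparse output (on a fixed topology) from two dense operands.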
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/backend/triton_kernels.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
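+# Expands CSR-style block-row offsets into an explicit row index per non-zero
+# block; one program instance handles one block row.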
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/matrix.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+    block_rows = np.prod(shape[:-1]) // block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/eltwise_ops.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/eltwise_ops_test.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/linear_ops.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
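+# Thin wrappers around the sputnik backend: dsd(a, b) multiplies a sparse `a` by
+# a dense `b` into a dense result, dds(a, b) is the dense-times-sparse variant,
+# and sdd(a, b, topo) multiplies two dense operands but only materializes the
+# blocks present in the `topo` Matrix.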
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/linear_ops_test.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/matrix_ops.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
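+    # A block counts as non-zero if any of its blocking x blocking entries is
+    # non-zero; the result is a [rows // blocking, cols // blocking] boolean mask.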
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/matrix_ops_test.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/random/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/random/random_ops.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
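+    # Zero out a uniformly random subset of blocks so that exactly `nnz` blocks
+    # survive, then tile each surviving block up to blocking x blocking entries.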
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/stk/random/random_ops_test.py b/build/torch29-cxx11-cu129-aarch64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from .. import random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/xpu_fused_moe.py b/build/torch29-cxx11-cu129-aarch64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM path; cutlass_grouped_gemm_xe2 below is the Xe2-specific variant.
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
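+
+
+# Worked example (illustrative): with num_tokens=1024 and
+# num_experts_per_node=8 the loop rejects 32 (32 blocks, 32 * 8 = 256 > 32)
+# and 64 (16 blocks, 16 * 8 = 128 > 64), then accepts 128
+# (8 blocks, 8 * 8 = 64 <= 128):
+#
+#   compute_num_tokens_per_block(1024, 8)  # -> 128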
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
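+
+
+# Example (illustrative sketch): a 16-byte uint8 buffer reinterpreted as two
+# int64 values or four int32 values.
+#
+#   buf = torch.zeros(16, dtype=torch.uint8)
+#   _bytes_to_typed_tensor(buf, torch.int64).numel()  # -> 2
+#   _bytes_to_typed_tensor(buf, torch.int32).numel()  # -> 4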
+
+
+def implement_zp(qweight):
+    # Convert u4 to s4 so the GEMM kernel does not need to handle a zero point.
+    # Only the default zero point (8) is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
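+
+
+# Worked example (illustrative): each input byte holds two unsigned 4-bit
+# values; 8 is subtracted from each and the result is re-packed as a sign bit
+# plus the low three bits of the two's complement representation.
+#
+#   q = torch.tensor([0x00, 0xFF, 0x80], dtype=torch.uint8)
+#   [hex(v) for v in implement_zp(q).tolist()]  # -> ['0x88', '0x77', '0x8']
+#   # (0, 0) -> (-8, -8), (15, 15) -> (7, 7), (8, 0) -> (0, -8)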
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ function; temporarily exposed here until the GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
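+
+
+# Example call (illustrative sketch; requires an XPU device, and all shapes
+# are arbitrary). Weight layouts follow the docstring above: w13 is
+# [E, 2*inter, hidden] and w2 is [E, hidden, inter].
+#
+#   tokens, hidden, inter, experts, topk = 16, 1024, 2048, 8, 2
+#   x = torch.randn(tokens, hidden, dtype=torch.bfloat16, device="xpu")
+#   w13 = torch.randn(experts, 2 * inter, hidden, dtype=torch.bfloat16, device="xpu")
+#   w2 = torch.randn(experts, hidden, inter, dtype=torch.bfloat16, device="xpu")
+#   probs = torch.randn(tokens, experts, device="xpu").softmax(dim=-1)
+#   weights, ids = torch.topk(probs, topk, dim=-1)
+#   out = xpu_fused_moe(x, w13, None, None, w2, None, None,
+#                       weights.float(), ids, topk, "silu", experts)
+#   # out has the same shape as x: [tokens, hidden]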
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
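+
+
+# Example (illustrative): top-1 uses max() with keepdim, otherwise torch.topk.
+#
+#   scores = torch.tensor([[0.1, 0.7, 0.2]])
+#   compute_top_k(scores, 2)  # values [[0.7, 0.2]], indices [[1, 2]]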
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
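+
+
+# Example (illustrative sketch; shapes are arbitrary and the router here is a
+# plain nn.Linear used only for demonstration):
+#
+#   router = torch.nn.Linear(1024, 8)
+#   x = torch.randn(4, 16, 1024)
+#   logits, weights, ids = route_tokens_xpu(
+#       x, router.weight, router.bias, moe_top_k=2, moe_num_experts=8)
+#   # logits: [64, 8]; weights/ids: [64, 2], with weights softmaxed over the top-k dim.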
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
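+
+
+# Example (illustrative sketch; the exact dtype/device requirements follow the
+# underlying kernel, which is typically called with int32 CUDA tensors):
+#
+#   ids = torch.tensor([0, 2, 2, 1], dtype=torch.int32, device="cuda")
+#   histogram(ids, num_bins=4)  # -> [1, 1, 2, 0]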
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
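+
+
+# Example (illustrative sketch; dtype/device requirements follow the
+# underlying kernels, which are typically called on int CUDA tensors):
+#
+#   x = torch.ones(4, dtype=torch.int32, device="cuda")
+#   cumsum(x, dim=0)                  # inclusive: [1, 2, 3, 4]
+#   cumsum(x, dim=0, exclusive=True)  # exclusive: [0, 1, 2, 3]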
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
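+
+
+# Example (illustrative sketch; end_bit only needs to cover the largest value,
+# and dtype/device requirements follow the underlying sort kernel):
+#
+#   ids = torch.tensor([3, 1, 2, 1], dtype=torch.int32, device="cuda")
+#   values, indices = argsort(ids, end_bit=2)
+#   # values -> [1, 1, 2, 3]; indices -> original positions of the sorted values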
+
+
+# Export public API
+__all__ = [
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/activation_fn.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/all_to_all.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
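+
+
+# Example (illustrative sketch; assumes torch.distributed is initialized with
+# two ranks and that, on this rank, 2 rows go to rank 0 and 1 row to rank 1
+# while 2 and 3 rows are received from them):
+#
+#   out, handle = all_to_all(
+#       x,                           # [3, hidden] on this rank
+#       output_split_sizes=[2, 3],
+#       input_split_sizes=[2, 1],
+#       group=dist.group.WORLD,
+#   )
+#   # out: [5, hidden]; handle is None unless async_op=True.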
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/arguments.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+    fc_cls: Any = torch.nn.Linear  # class of the fully connected layer in the shared expert (allows using a custom FC layer, e.g. te.Linear for FP8)
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+            try:
+                import triton
+                from packaging import version
+                if version.parse(triton.__version__) >= version.parse('3.2.0'):
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
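+
+
+# Example (illustrative sketch): a small dMoE configuration. Note that the
+# device field defaults to torch.cuda.current_device(), and mlp_impl='grouped'
+# requires the grouped GEMM backend to be available.
+#
+#   args = Arguments(
+#       hidden_size=1024,
+#       ffn_hidden_size=4096,
+#       moe_num_experts=8,
+#       moe_top_k=2,
+#       mlp_impl='grouped',
+#       fp16=False,
+#       bf16=True,
+#   )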
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/common.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/dmlp_registry.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
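+
+
+# Example (illustrative sketch): selecting the grouped GLU expert MLP.
+#
+#   args = Arguments(hidden_size=1024, ffn_hidden_size=4096,
+#                    moe_num_experts=8, mlp_type='glu', mlp_impl='grouped')
+#   expert_mlp = get(args)  # -> glu.GroupedGLU built from args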
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/dmoe.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
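+
+
+# Example (illustrative): promote_scalar(torch.tensor(5)) returns tensor([5]),
+# while tensors that already have at least one dimension pass through unchanged.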
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/gelu.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/glu.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+    """GLU for the shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/memory_test.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ # NOTE: dist.init_process_group returns None; use the default (WORLD) group.
+ dist.init_process_group(backend='nccl')
+ group = dist.group.WORLD
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/mlp.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
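+
+
+# NOTE (editor sketch): `resolve_dtensor` keeps the compute path working on
+# plain local tensors. A hypothetical usage, assuming a tensor-parallel setup
+# that wraps parameters in DTensor:
+#
+#   w = resolve_dtensor(self.w1)  # DTensor -> local shard; plain Tensor -> unchanged
+#   y = torch.bmm(x, w)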
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
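+
+
+# NOTE (editor example, values chosen only for illustration): with
+# world_size = 4, moe_num_experts = 8 and ffn_hidden_size = 4096,
+# expert_sharding_degree = min(4, 8) = 4 and hidden_sharding_degree = 4 // 4 = 1,
+# so each rank keeps 8 // 4 = 2 consecutive experts and all 4096 rows. With
+# world_size = 16 instead, expert_sharding_degree = 8 and
+# hidden_sharding_degree = 2, so each rank keeps a single expert and 2048 of its rows.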
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+ # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first, the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
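+
+
+# NOTE (editor sketch): MLP.forward assumes the input was already permuted into
+# per-expert blocks of equal capacity, i.e.
+#   x:  [experts_per_rank, expert_capacity, hidden_size]
+#   w1: [experts_per_rank, hidden_size, ffn_per_rank]
+#   w2: [experts_per_rank, ffn_per_rank, hidden_size]
+# so the first bmm produces [experts_per_rank, expert_capacity, ffn_per_rank]
+# and the second bmm maps back to hidden_size.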
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
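+
+
+# NOTE (editor summary): MemoryOptimizedMLP trades compute for memory. Forward
+# saves only (w1, w2, the topology tensors, x, sdd_out); backward re-runs the
+# activation and reuses the activation/gradient buffers in place rather than
+# allocating separate dactivation_fn_out, dsdd_out and dx tensors. Illustrative
+# call, assuming a valid block-sparse `topo` built by the dMoE routing code:
+#
+#   out = memory_optimized_mlp(x, w1, w2, topo, torch.nn.functional.gelu)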
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+ # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first, the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> paste -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # Use a weighted sum for the shared expert output, weighted by the
+ # total number of experts used:
+ # out = shared / (top_k + 1) + expert_out * top_k / (top_k + 1).
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/moe.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+ f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
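+
+
+# NOTE (editor example, illustrative numbers only): with moe_num_experts = 8,
+# moe_loss_weight = 0.1, num_layers = 12, tokens = 2048 and moe_top_k = 2 the
+# scale above is (8 * 0.1) / (12 * 2048 * 2) ~ 1.6e-5, applied to the dot
+# product of per-expert token counts and mean router scores.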
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
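+
+ # NOTE (editor example, illustrative numbers only): with tokens = 4096,
+ # top_k = 2, world_size = 8, num_experts = 64 and moe_capacity_factor = 1.25,
+ # tokens_per_expert = 2 * 4096 * 8 / 64 = 1024, so expert_capacity = 1280.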
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
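+
+ # NOTE (editor example): for top_expert = [2, 0, 1, 0] with 3 experts, sort
+ # yields bin_ids = [0, 0, 1, 2] and indices = [1, 3, 2, 0], histogram yields
+ # tokens_per_expert = [2, 1, 1], and the inclusive cumsum gives
+ # bins = [2, 3, 4], i.e. expert e owns sorted rows [bins[e - 1], bins[e])
+ # (with bins[-1] read as 0).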
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+ # expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/mpu.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__()
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
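+
+
+# NOTE (editor example, illustrative numbers only): with world_size = 8 and
+# moe_num_experts = 4, expert_sharding_degree = 4 and hidden_sharding_degree = 2,
+# so experts_per_rank = 1 and features_per_rank = ffn_hidden_size // 2. Ranks
+# that share an expert each hold half of its ffn rows.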
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/router.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
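+
+
+# NOTE (editor note): per router, the unscaled z-loss above is the mean over
+# tokens i of (logsumexp_j logits[i, j]) ** 2, which penalizes large router
+# logits and keeps the softmax well conditioned.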
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch29-cxx11-cu129-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so b/build/torch29-cxx11-cu129-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..d6d595b1f0221f9d3793e58d2f549ef17693f655
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_megablocks_cuda_7a6bcf4.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94304fb698702f77c92b943ee0a64f00b26aedca7afa944c3a470de2ca7a13e5
+size 16003376
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_ops.py b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..abde001f8cf5f78a02794d6e9a81fd8195e65d77
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_7a6bcf4
+ops = torch.ops._megablocks_cuda_7a6bcf4
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_7a6bcf4::{op_name}"
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_version.py b/build/torch29-cxx11-cu129-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/backend/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/backend/kernels.py b/build/torch29-cxx11-cu129-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment without CUDA.
+# This preserves the original code while still allowing tests to run on CPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has at least as many
+ # rows, since its bins may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
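+
+
+# NOTE (editor sketch): gather followed by scatter is the MoE dispatch/combine
+# pair used by the dMoE layers. An illustrative round trip, where `expert_mlp`
+# is a stand-in for the per-expert computation and `indices`, `bin_ids`, `bins`
+# and `expert_weights` come from the routing code:
+#
+#   h = gather(x, indices, bin_ids, None, bins, top_k)  # [tokens * top_k, hidden]
+#   h = expert_mlp(h)
+#   y = scatter(h, indices, bin_ids, expert_weights, bins, top_k)  # [tokens, hidden]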
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per routed token. The expert-ordered input 'x' has at
+ # least as many rows, since its bins may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# x: (num_experts, expert_capacity, num_columns), real.
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens * top_k), real.
+# indices: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/benchmark_util.py b/build/torch29-cxx11-cu129-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
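+
+
+# Usage sketch (illustrative; assumes a CUDA device is available): time a plain
+# matrix multiply with the helpers above.
+def _benchmark_example():
+    a = torch.randn(1024, 1024, device="cuda")
+    mean_ms, std_ms = benchmark_function(lambda: a @ a)
+    log_benchmark("MatMul", {"shape": "1024x1024"}, mean_ms, std_ms)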
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/cpu_fused_moe.py b/build/torch29-cxx11-cu129-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
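+
+
+# Sanity-check sketch (illustrative, not used by the kernels): the formula from
+# the docstring above, written out with unfused ops, should match the function.
+def _swigluoai_check(gate: torch.Tensor, up: torch.Tensor) -> bool:
+    g = gate.clamp(max=7.0)
+    ref = (up.clamp(min=-7.0, max=7.0) + 1) * (g * torch.sigmoid(g * 1.702))
+    return bool(torch.allclose(swigluoai_activation(gate, up), ref))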
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+    This implementation loops over the experts, but handles all tokens routed to
+    each expert with batched tensor operations, which keeps the work inside
+    fast ATen kernels on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+ # Build expert mask: which tokens go to which expert
+ # expert_mask[expert_id] contains indices of (token_idx, topk_pos) pairs
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
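+
+
+# Usage sketch (illustrative only; the sizes below are arbitrary assumptions,
+# not part of the kernel API): route tokens with `route_tokens_cpu`, then
+# dispatch them through `cpu_fused_moe` with a non-interleaved gate/up layout.
+def _cpu_fused_moe_example() -> torch.Tensor:
+    tokens, hidden, inter, experts, top_k = 8, 32, 64, 4, 2
+    x = torch.randn(tokens, hidden)
+    router_weight = torch.randn(experts, hidden)
+    w1 = torch.randn(experts, hidden, 2 * inter) * 0.02  # gate_up_proj layout
+    w2 = torch.randn(experts, inter, hidden) * 0.02      # down_proj layout
+    _, topk_weights, topk_ids = route_tokens_cpu(x, router_weight, None, top_k, experts)
+    return cpu_fused_moe(x, w1, w2, topk_weights, topk_ids,
+                         activation="silu", is_interleaved=False)  # [tokens, hidden]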
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/cpu_moe_cpp.py b/build/torch29-cxx11-cu129-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+        # Default the MXFP4 flag; it is enabled below when GPT-OSS precision configs are detected.
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "CPUMegaBlocksMoeMLP"]
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/grouped_gemm/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/grouped_gemm/backend.py b/build/torch29-cxx11-cu129-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/grouped_gemm/ops.py b/build/torch29-cxx11-cu129-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
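+
+
+# Usage sketch (illustrative; requires the compiled backend ops and a CUDA
+# device): `a` stacks the rows of every group, `b` holds one weight matrix per
+# group, and `batch_sizes` (a CPU int64 tensor) says how many rows of `a`
+# belong to each group.
+def _gmm_example():
+    batch_sizes = torch.tensor([3, 5], dtype=torch.int64)            # two groups
+    a = torch.randn(8, 16, device="cuda", dtype=torch.bfloat16)      # [sum(batch_sizes), k]
+    b = torch.randn(2, 16, 32, device="cuda", dtype=torch.bfloat16)  # [num_groups, k, n]
+    return gmm(a, b, batch_sizes)                                    # [8, 32]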
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/grouped_gemm_util.py b/build/torch29-cxx11-cu129-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # grouped_gemm is vendored inside this package, so the import normally succeeds.
+    from . import grouped_gemm  # noqa: F401
+    _grouped_gemm_is_available = True
+except ImportError:
+    warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+    msg = (
+        'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+    )
+    assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/layers.py b/build/torch29-cxx11-cu129-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
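+
+
+# Worked example (illustrative numbers, not tied to any real configuration):
+# with 8 ranks and 4 experts, the experts are sharded 4 ways and each expert's
+# FFN hidden dimension is split across the remaining factor of 2.
+def _sharding_example() -> None:
+    world_size, num_experts, ffn_hidden = 8, 4, 1024
+    assert expert_sharding_degree(world_size, num_experts) == 4
+    assert hidden_sharding_degree(world_size, num_experts, ffn_hidden) == 2
+    assert experts_per_rank(num_experts, world_size) == 1
+    assert features_per_rank(ffn_hidden, world_size, num_experts) == 512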
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
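+
+
+# Example (illustrative): with the weighted sum enabled and moe_top_k=3, the
+# shared expert contributes 1/4 of the output and the routed experts 3/4.
+def _combine_example() -> torch.Tensor:
+    shared = torch.ones(2, 4)
+    routed = torch.zeros(2, 4)
+    return combine_expert_shared_outputs(shared, routed, True, moe_top_k=3)  # all entries 0.25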
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    assert tokens_per_expert.numel() == num_experts
+    scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
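+
+
+# Example (illustrative): with perfectly uniform routing the auxiliary
+# load-balancing loss evaluates to exactly 1.0.
+def _load_balancing_loss_example() -> torch.Tensor:
+    tokens, top_k, num_experts = 16, 2, 4
+    expert_scores = torch.full((tokens, num_experts), 1.0 / num_experts)
+    tokens_per_expert = torch.full((num_experts,), tokens * top_k / num_experts)
+    return load_balancing_loss(tokens_per_expert, expert_scores, top_k, num_experts)  # tensor(1.)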
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
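+
+
+# Example (illustrative): 1024 tokens with top-4 routing over 128 experts and a
+# capacity factor of 1.0 gives 4 * 1024 / 128 = 32 slots per expert.
+def _expert_capacity_example() -> None:
+    assert expert_capacity_fn(1024, 4, 128, None, 1.0, False) == 32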
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+        # Kick off the exchange asynchronously so it overlaps with the local permutation below.
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
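+# Note: `init_method` (and `output_layer_init_method`) must initialize the
+# given tensor in place, so `torch.nn.init` functions can be passed directly;
+# see the illustrative `_example_shared_expert_setup` sketch further below.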
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
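+#
+# Illustrative sketch (an assumption about the integration, not code in this
+# module) of the hook shape this helper expects: a pre-forward hook defined as
+# a nested function that closes over `device_mesh`,
+#
+#   def _attach_mesh(module, device_mesh):
+#       def hook(mod, args):  # `device_mesh` is a free variable of `hook`
+#           return args
+#       module.register_forward_pre_hook(hook)
+#
+# which is why it can later be recovered from `hook.__closure__` below.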
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
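+# The helper below is an illustrative sketch only: it is not exercised
+# anywhere in this module, and the sizes (1024 hidden units, 4096 shared
+# expert units), dtype, and activation are placeholder assumptions rather
+# than library defaults.
+def _example_shared_expert_setup(
+    mlp: MegaBlocksMoeMLPWithSharedExpert,
+    hidden_size: int = 1024,
+    shared_expert_hidden_size: int = 4096,
+) -> MegaBlocksMoeMLPWithSharedExpert:
+    """Wire shared-expert weights into an existing module (illustrative)."""
+    up_w, down_w, up_b, down_b = create_shared_expert_weights(
+        hidden_size=hidden_size,
+        shared_expert_hidden_size=shared_expert_hidden_size,
+        device=torch.device("cuda"),
+        dtype=torch.bfloat16,
+        init_method=torch.nn.init.xavier_uniform_,
+    )
+    mlp.set_shared_expert_weights(
+        up_proj_weight=up_w,
+        down_proj_weight=down_w,
+        up_proj_bias=up_b,
+        down_proj_bias=down_b,
+        weighted_sum=False,
+        activation_fn=torch.nn.functional.gelu,
+    )
+    return mlp
+
+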
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/megablocks/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is: after adding it to `sys.modules`,
+    # it would also be used for other imports. Instead, we derive a unique
+    # module name from the hex-encoded hash of the file path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
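+# Load the `__init__.py` that lives one directory above this file under a
+# path-derived module name and re-export its symbols into this module.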
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/metadata.json b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e3e4edf582b7ffb515d0ed32e9fc9c89f125c441
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "10.1",
+ "12.0",
+ "7.0",
+ "7.2",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per fp16 element.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/binned_gather.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/binned_scatter.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/cumsum.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/gather.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/histogram.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/histogram_benchmark.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b9c6047567b87a295979498142230d1b0c9411
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class HistogramBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_HISTOGRAM_TESTS)
+# def testTorchHistogram(self, n, dtype, max_val):
+# x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/matmul_benchmark.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4b9b8866ffed2eb769b77f2320c82e5491ae0e
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1) which calls
+# torch.as_strided(...). Circumvent this chain to avoid an overhead
+# this adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+# class MatmulBenchmark(parameterized.TestCase):
+#
+# def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+# blocking = 128
+# padded_tokens, _ = x.size()
+# assert padded_tokens % blocking == 0
+# assert fhs % blocking == 0
+#
+# # Offsets for the sparse matrix. All rows have the
+# # same number of nonzero blocks dictated by the
+# # dimensionality of a single expert.
+# block_rows = padded_tokens // blocking
+# blocks_per_row = fhs // blocking
+# offsets = torch.arange(
+# 0,
+# block_rows * blocks_per_row + 1,
+# blocks_per_row,
+# dtype=torch.int32,
+# device=x.device,
+# )
+#
+# # Indices for the sparse matrix. The indices for
+# # the intermediate matrix are dynamic depending
+# # on the mapping of tokens to experts.
+# column_indices = ops.topology(
+# padded_bins,
+# blocking,
+# block_rows,
+# blocks_per_row,
+# )
+# data = torch.empty(
+# column_indices.numel(),
+# blocking,
+# blocking,
+# dtype=torch.float16,
+# device=x.device,
+# )
+# shape = (padded_tokens, fhs * ne)
+# row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+# return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+# def build_input_matrix(self, sl, hs, ne):
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Assign tokens to experts uniformly.
+# top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+# return out, padded_bins
+#
+# def build_weight_matrix(self, ne, hs, fhs):
+# return torch.randn((hs, ne * fhs)).cuda().half()
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(x, w, topo)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(topo, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradX::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# topo = topo.t()
+#
+# def benchmark():
+# return stk.ops.dsd(topo, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+# def benchmark():
+# return stk.ops.dsd(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DSD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# w = transpose_view(w)
+#
+# def benchmark():
+# return stk.ops.sdd(out, w, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::SDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+# x, padded_bins = self.build_input_matrix(sl, hs, ne)
+# w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+# x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+# out = stk.ops.dsd(x, w)
+# x = x.t()
+#
+# def benchmark():
+# return stk.ops.dsd(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DSD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.nnz * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+# w = w.transpose(1, 2).contiguous()
+# w = w.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0::Fwd:DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = w.transpose(1, 2).contiguous()
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradX:DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, hs)).cuda().half()
+# w = torch.randn((ne, hs, fhs)).cuda().half()
+# out = torch.bmm(x, w)
+# out = out.transpose(1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '0:GradW:DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * fhs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+# def benchmark():
+# return torch.bmm(x, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::Fwd::DDD::NN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# w = torch.transpose(w, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(out, w)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradX::DDD::NT',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+#
+# @parameterized.parameters(*_MATMUL_TESTS)
+# def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+# assert (sl % ne) == 0
+# x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+# w = torch.randn((ne, fhs, hs)).cuda().half()
+# out = torch.bmm(x, w)
+# x = torch.transpose(x, 1, 2)
+#
+# def benchmark():
+# return torch.bmm(x, out)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'ffn_hidden_size': fhs,
+# 'num_experts': ne,
+# }
+# log_benchmark(
+# '1::GradW::DDD::TN',
+# arguments,
+# mean_t,
+# std_t,
+# x.numel() * hs * 2,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/padded_gather.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/padded_scatter.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe4735891446b46f93170c64c23fe63632bf93
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+# class PaddedScatterTest(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+# def testPaddedScatter(self, sl, hs, ne, top_k):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# # Sample weights for the scatter reduce.
+# weights = torch.rand((sl * top_k,)).cuda().half()
+#
+# # Gather the data to prepare for backwards.
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+# def benchmark():
+# return ops.padded_scatter(
+# x,
+# indices,
+# bin_ids,
+# weights,
+# bins,
+# padded_bins,
+# top_k,
+# )
+#
+# time, std = benchmark_util.benchmark_function(benchmark)
+# benchmark_util.log_benchmark(
+# 'Padded Scatter',
+# {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# 'top_k': top_k,
+# },
+# time,
+# std,
+# )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/permute_benchmark.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..697abddbb3a2082ec4ddd6d94f89f7faabb34b40
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+# from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+# class PermuteBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedGather(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.binned_gather(x, indices, bins, ec)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testBinnedScatter(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(indices, ne)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.binned_gather(x, indices, bins, ec)
+#
+# def benchmark():
+# return ops.binned_scatter(x, indices, bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedGather(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+# def benchmark():
+# return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testPaddedScatter(self, sl, hs, ne):
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+#
+# # Randomly assign tokens to experts.
+# top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+# bin_ids, indices = ops.sort(top_expert)
+# tokens_per_expert = ops.histogram(top_expert, ne)
+# padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+# padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+# bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+# x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+# def benchmark():
+# return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_PERMUTE_TESTS)
+# def testCopy(self, sl, hs, ne):
+# # NOTE: Capacity factor == 1.
+# # ec = sl // ne
+#
+# # Create the data and indices.
+# x = torch.randn((sl, hs)).cuda().half()
+# y = x.clone()
+#
+# def benchmark():
+# return y.copy_(x)
+#
+# mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+# arguments = {
+# 'sequence_length': sl,
+# 'hidden_size': hs,
+# 'num_experts': ne,
+# }
+# benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/repeat.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/replicate.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/round_up.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+    # do this in a custom kernel. We only expect
+    # to use this on arrays of fewer than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
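+
+
+# Example (illustrative values):
+#   round_up(torch.tensor([5, 128, 130], dtype=torch.int32), 128)
+#   -> tensor([128, 128, 256])
+# i.e. every entry is rounded up to the nearest multiple of `value`.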
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/scatter.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/sort.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
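+
+# Example (illustrative): for per-token expert ids `top_expert`,
+#   bin_ids, indices = sort(top_expert, sort_end_bit)
+# returns the expert ids in ascending order (`bin_ids`) together with the
+# permutation that produced them (`indices`), i.e. an argsort keyed on the
+# low `sort_end_bit` bits.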
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/sort_benchmark.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..11043c0824c36372585f1d9f48480c2a6ef32eb6
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+# class SortBenchmark(parameterized.TestCase):
+#
+# @parameterized.parameters(*_SORT_TESTS)
+# def testSort(self, n, dtype, max_val):
+# if max_val is None:
+# max_val = np.iinfo(numpy_dtype(dtype)).max
+# end_bit = int(np.ceil(np.log2(max_val)))
+# x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+# arguments = {
+# 'n': n,
+# 'dtype': dtype,
+# 'max_val': max_val,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+#
+# @parameterized.parameters(*_BASELINE_SORT_TESTS)
+# def testTorchSort(self, n):
+# x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+# mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+# arguments = {
+# 'n': n,
+# }
+# log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/stk_autocast.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+        # Cast dict values (and keys) so keyword arguments are handled too.
+        return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+    elif isinstance(x, (list, tuple)):
+        return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/sum.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/ops/topology.py b/build/torch29-cxx11-cu129-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/backend/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/backend/autocast.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+        # Cast dict values (and keys) so keyword arguments are handled too.
+        return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+    elif isinstance(x, (list, tuple)):
+        return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/backend/sputnik.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
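+
+# Naming note for this backend: the three-letter op names read output-lhs-rhs,
+# so `dsd` computes a dense output from a sparse lhs and a dense rhs, `dds` a
+# dense output from a dense lhs and a sparse rhs, and `sdd` a sparse output
+# (with a fixed topology) from two dense operands.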
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/backend/triton_kernels.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+    error_string = "incompatible dimensions: tensor dimension of length {} must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+ #Store to sparse matrix
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/matrix.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers.
+# 3. Make indentation consistent.
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+ block_rows = np.prod(shape[:-1]) / block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
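+
+
+# Minimal construction sketch (illustrative only; assumes a CUDA device and
+# float16 data so that validate() passes, and uses the stk.ops helpers defined
+# elsewhere in this package):
+#
+#   import stk
+#   dense = torch.randn(256, 256, dtype=torch.float16, device="cuda")
+#   x = stk.ops.to_sparse(dense, blocking=128)  # returns a Matrix
+#   x.validate()
+#   assert stk.ops.to_dense(x).shape == dense.shape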
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
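+
+# Usage sketch (illustrative only; assumes `a` and `b` are stk.Matrix objects
+# that share the same topology, e.g. built with stk.ops.to_sparse from dense
+# tensors masked by the same block pattern):
+#
+#   c = mul(a, b)  # Matrix with a's topology
+#   # stk.ops.to_dense(c) equals stk.ops.to_dense(a) * stk.ops.to_dense(b)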
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939372a5c68ac92b47b11015db4f75f4fd60ffa
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+# from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+# def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+# a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+# b_dense, b = _dense_and_sparse_like(a)
+#
+# out = stk.ops.mul(a, b)
+# expected_out = torch.mul(a_dense, b_dense)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size(), out.size())
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = a_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = b_dense.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size(), grad.size())
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/linear_ops.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
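+
+
+# Usage sketch (illustrative only; the shapes and sparsity pattern are
+# hypothetical, and a CUDA device with float16 data is assumed):
+#
+#   topo = stk.random.mask(512, 512, sparsity=0.75, blocking=128).cuda()
+#   a = torch.randn(512, 1024, dtype=torch.float16, device="cuda")
+#   b = torch.randn(1024, 512, dtype=torch.float16, device="cuda")
+#   out = sdd(a, b, topo)             # sparse Matrix with topo's topology
+#   y = dsd(out, b.t().contiguous())  # dense [512, 1024] result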
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1c24d350df9c1b2346c7da885502cd696c88867
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+# from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+# def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = stk.ops.to_dense(a.grad)
+# expected_grad = _mask(a_dense.grad, a.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+# expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# out.sum().backward()
+#
+# # Validate the results.
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = stk.ops.to_dense(b.grad)
+# expected_grad = _mask(b_dense.grad, b.grad)
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+# # Construct the operands.
+# a_shape = (k, m) if trans_a else (m, k)
+# a, acp = _dense_2x(*a_shape, dtype)
+# b_shape = (n, k) if trans_b else (k, n)
+# b, bcp = _dense_2x(*b_shape, dtype)
+# _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+# # Execute the matmul.
+# out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+# expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+# # Compute the gradients w.r.t. the inputs.
+# expected_out.sum().backward()
+# stk.ops.sum(out).backward()
+#
+# # Validate the results.
+# out = stk.ops.to_dense(out)
+# self.assertEqual(out.dim(), 2)
+# self.assertEqual(expected_out.size()[0], out.size()[0])
+# self.assertEqual(expected_out.size()[1], out.size()[1])
+# self.assertTrue(allclose(out, expected_out))
+#
+# # LHS gradient.
+# grad = a.grad
+# expected_grad = acp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+#
+# # RHS gradient.
+# grad = b.grad
+# expected_grad = bcp.grad
+# self.assertEqual(grad.dim(), 2)
+# self.assertEqual(expected_grad.size()[0], grad.size()[0])
+# self.assertEqual(expected_grad.size()[1], grad.size()[1])
+# self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/matrix_ops.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
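+
+
+# Round-trip sketch (illustrative only; a CUDA float16 input is assumed so the
+# resulting Matrix also passes Matrix.validate()):
+#
+#   dense = torch.randn(256, 256, dtype=torch.float16, device="cuda")
+#   sparse = to_sparse(dense, blocking=128)
+#   assert torch.equal(to_dense(sparse), dense)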
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d172d921f6f08b0e4fb709207a458b0e1e071bd0
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+# from absl.testing import parameterized
+import stk
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+# def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+# mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+# x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+# # Convert the matrix to sparse format.
+# sparse_x = stk.ops.to_sparse(x, blocking)
+#
+# # Validate the matrix.
+# sparse_x.validate()
+#
+# # Validate the shape.
+# self.assertEqual(sparse_x.dim(), 2)
+# self.assertEqual(sparse_x.size()[0], rows)
+# self.assertEqual(sparse_x.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(sparse_x.nnz, nnz)
+#
+# # Convert back to dense format.
+# dense_x = stk.ops.to_dense(sparse_x)
+#
+# # Validate the shape.
+# self.assertEqual(dense_x.dim(), 2)
+# self.assertEqual(dense_x.size()[0], rows)
+# self.assertEqual(dense_x.size()[1], cols)
+#
+# # Validate the sparsity
+# self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+# # Validate the output.
+# self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/random/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/random/random_ops.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
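+
+
+# Usage sketch (illustrative only):
+#
+#   m = dense_mask(256, 512, sparsity=0.75, blocking=128)  # dense 0/1 float32 mask
+#   s = mask(256, 512, sparsity=0.75, blocking=128)         # stk.Matrix of ones (fp16)
+#   x = randn((256, 512), sparsity=0.5, blocking=128)       # random data in a random topology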
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/stk/random/random_ops_test.py b/build/torch29-cxx11-cu129-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d716b78b5ec009cbf9ac2dfdf09162a0102e62
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+# from absl.testing import parameterized
+from stk import random
+import torch
+
+
+# @parameterized.parameters(
+# (8, 16, 0.0, 1),
+# (8, 16, 0.5, 1),
+# (8, 16, .95, 1),
+# (16, 8, 0.0, 1),
+# (16, 8, 0.5, 1),
+# (16, 8, .95, 1),
+# (8, 16, 0.0, 8),
+# (8, 16, 0.5, 8),
+# (8, 16, 1.0, 8),
+# (16, 8, 0.0, 8),
+# (16, 8, 0.5, 8),
+# (16, 8, 1.0, 8),
+# (128, 256, 0.5, 16),
+# (256, 128, 0.75, 32),
+# (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+# def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+# mask = random.dense_mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(
+# torch.count_nonzero(mask).item(),
+# nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask, 0),
+# torch.eq(mask, 1))))
+#
+# def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+# mask = random.mask(
+# rows, cols, sparsity, blocking)
+#
+# # Validate the matrix.
+# mask.validate()
+#
+# # Validate the shape.
+# self.assertEqual(mask.dim(), 2)
+# self.assertEqual(mask.size()[0], rows)
+# self.assertEqual(mask.size()[1], cols)
+#
+# # Validate the sparsity.
+# numblocks = rows // blocking * cols // blocking
+# nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+# self.assertEqual(mask.nnz, nnz)
+#
+# # Check values are zero or one.
+# self.assertTrue(
+# torch.all(torch.logical_or(
+# torch.eq(mask.data, 0),
+# torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/xpu_fused_moe.py b/build/torch29-cxx11-cu129-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# default
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
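+    # For example, with expert_token_count = [3, 0, 5] the exclusive prefix sum
+    # above yields expert_offset = [0, 3, 3, 8]: expert i owns rows
+    # [expert_offset[i], expert_offset[i + 1]) of the grouped operands.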
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
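+
+# For example, with num_tokens = 4096 and num_experts_per_node = 8, the first
+# candidate satisfying the bound is 256: ceilDiv(4096, 256) * 8 == 128 <= 256.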
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
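+
+# For example, a 32-byte uint8 workspace slice reinterpreted as torch.int64
+# yields a 4-element tensor (32 // 8); the bytes are copied, not aliased.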
+
+
+def implement_zp(qweight):
+    # Convert u4 to s4 so the GEMM kernel does not need to apply a zero point.
+    # Only the default zero point (8) is supported for now.
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
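+
+# Worked example: the packed byte 0x9F (high nibble 9, low nibble 15) maps to
+# the signed values 9 - 8 = 1 and 15 - 8 = 7; pack_compact re-encodes each as a
+# sign bit plus the low three bits, giving nibbles 0b0001 and 0b0111, i.e. 0x17.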
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ function; temporarily exposed
+    # here until the GEMM fusion lands.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
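+    # Each sub-buffer size is rounded up to a multiple of 256 bytes so every region
+    # starts at a 256-byte-aligned offset, e.g. a 1000-byte request reserves 1024 bytes.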
+
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
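+# For example, with moe_jitter_eps=0.01 every element of x is scaled by a factor
+# drawn uniformly from [0.99, 1.01).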
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
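+# Illustrative shapes: for x of shape [tokens, hidden_size] the returned logits are
+# [tokens, moe_num_experts], while expert_weights and expert_indices are both
+# [tokens, moe_top_k].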
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
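+        # e.g. 128 experts sharded across ep_size=4 ranks leaves 32 experts on this rank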
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
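+# Illustrative only: for x = [1, 2, 3], the inclusive cumsum is [1, 3, 6] while the
+# exclusive cumsum is [0, 1, 3].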
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/activation_fn.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/all_to_all.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
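+# Illustrative use from one rank's point of view (assuming a 2-rank group and split
+# sizes that agree with the peers'):
+#   out, handle = all_to_all(x, output_split_sizes=[4, 2], input_split_sizes=[3, 3], group=group)
+# sends x[:3] to rank 0 and x[3:6] to rank 1, and receives 4 rows from rank 0 plus
+# 2 rows from rank 1 into `out`; `handle` is None unless async_op=True.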
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/arguments.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in shared expert (purpose: to allow using custom FC layer eg te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/common.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/dmlp_registry.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e. only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/dmoe.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/gelu.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
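+# The factor `ff` in _gelu_backward_inplace is the derivative of the tanh-approximate
+# GeLU 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))); the constants are
+# 0.79788456 ~= sqrt(2/pi) and 0.1070322243 ~= 3 * 0.044715 * sqrt(2/pi).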
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/glu.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+    """GLU for shared expert.
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/memory_test.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MiB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/mlp.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+    Note: this is a copy -> paste -> modify of the LLM-Foundry MPTMLP class.
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+            # Use a weighted sum for the shared expert output,
+            # weighted by the number of experts used.
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
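+
+
+# Combination sketch (hypothetical numbers, not part of the original module):
+# with moe_top_k = 2 and shared_expert_weighted_sum enabled, the shared expert
+# counts as one extra expert, so t_experts = 3 and add_experts_sharedexpert
+# returns shared_expert_out / 3 + expert_out * (2 / 3); with the flag disabled
+# the two outputs are simply summed.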
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/moe.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} token_per_experts '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
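+
+
+# Usage sketch (assumed integration, not part of this module): each MoE layer
+# records (tokens_per_expert, scores) via save_load_balancing_loss in its
+# forward pass; a training loop can then do roughly
+#
+#     loss = task_loss + batched_load_balancing_loss(args)
+#     loss.backward()
+#     clear_load_balancing_loss()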
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped so that the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
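+
+    # Capacity sketch (hypothetical numbers): with top_k = 2, 4096 tokens,
+    # an expert parallel world size of 8 and 64 experts, tokens_per_expert is
+    # 2 * 4096 * 8 / 64 = 1024, so a capacity factor of 1.25 yields an
+    # expert_capacity of 1280 slots per expert.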
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+            # If we're sharding the experts along the hidden dimension,
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+        # If we're sharding the experts along the hidden dimension,
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+            # Calculate the bin boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/mpu.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
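+
+
+# Sharding sketch (hypothetical numbers): with an expert parallel world size
+# of 8 and moe_num_experts = 4, expert_sharding_degree(args) = 4 and
+# hidden_sharding_degree(args) = 8 // 4 = 2, so each expert is split across
+# two ranks, experts_per_rank(args) = 1 and features_per_rank(args) is
+# args.ffn_hidden_size // 2.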
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/router.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
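+
+# Example (hypothetical values): for an expert_indices tensor with 4 elements
+# and num_experts = 2, _uniform_expert_assignment produces [0, 1, 0, 1]
+# reshaped to the input shape, ignoring the router scores entirely; this is
+# intended for benchmarking only, not for training.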
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
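+
+
+# Shape sketch (assumed shapes): with hidden_size = 1024, moe_num_experts = 8
+# and moe_top_k = 2, calling the router on an [sl, bs, 1024] activation
+# returns scores of shape [sl * bs, 8] and expert_weights / expert_indices of
+# shape [sl * bs, 2].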
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_layers/sharedexpert_registry.py b/build/torch29-cxx11-cu130-aarch64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_megablocks_cuda_6e04dec.abi3.so b/build/torch29-cxx11-cu130-aarch64-linux/_megablocks_cuda_6e04dec.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..51d57a635e3e06c4a428fea2de0175b62370f823
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_megablocks_cuda_6e04dec.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8669b2a5cf6f36ab1d6c518040d4f4e2874d7b1c5880b4424d21f89c60e77c5f
+size 12070448
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_ops.py b/build/torch29-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2f202b8db3c3f3028303ab4308cf35f950e2c74
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_6e04dec
+
+ops = torch.ops._megablocks_cuda_6e04dec
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_6e04dec::{op_name}"
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_version.py b/build/torch29-cxx11-cu130-aarch64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/backend/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/backend/kernels.py b/build/torch29-cxx11-cu130-aarch64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have CUDA.
+# This approach preserves the original code but enables testing without a GPU.
+if torch.cuda.is_available() is False:
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
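+# Shape sketch (hypothetical sizes): for 6 tokens, top_k = 1 and a padded_bins
+# vector ending in 8, padded_gather above returns an [8, hidden_size] tensor
+# in which two rows remain zero padding; gather below involves no padding and
+# always returns [tokens * top_k, hidden_size] rows.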
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/benchmark_util.py b/build/torch29-cxx11-cu130-aarch64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+    print(f'mean time = {time:.3f}ms, std time = {std:.3f}ms')
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
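+
+
+# Usage sketch (hypothetical callable and arguments):
+#
+#     mean_ms, std_ms = benchmark_function(lambda: mlp(x), iterations=100)
+#     log_benchmark('MLP', {'batch_size': x.shape[0]}, mean_ms, std_ms)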
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/cpu_fused_moe.py b/build/torch29-cxx11-cu130-aarch64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
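+
+# Shape sketch (hypothetical sizes): for 16 tokens, hidden_size = 64,
+# 8 experts and moe_top_k = 2, route_tokens_cpu returns logits of shape
+# [16, 8] plus expert_weights and expert_indices of shape [16, 2]; the
+# weights are softmaxed over the selected top-k entries only.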
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+ This implementation processes all experts in parallel using batched operations
+ instead of sequential for loops, which is more efficient on CPU.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # Process each expert in turn: find the (token, top-k position) pairs
+    # routed to it, run its MLP on those tokens, and accumulate the
+    # weighted outputs into `output`.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
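+
+
+# Illustrative sketch (not part of the kernel API): wire route_tokens_cpu and
+# cpu_fused_moe together on tiny random tensors. All shapes and names below are
+# hypothetical and only meant to show the expected layouts.
+def _example_cpu_fused_moe():  # pragma: no cover
+ num_tokens, hidden, inter, num_experts, top_k = 4, 8, 16, 4, 2
+ x = torch.randn(num_tokens, hidden)
+ router_weight = torch.randn(num_experts, hidden)
+ _, weights, ids = route_tokens_cpu(x, router_weight, None, top_k, num_experts)
+ w1 = torch.randn(num_experts, hidden, 2 * inter) * 0.02  # gate_up_proj layout
+ w2 = torch.randn(num_experts, inter, hidden) * 0.02  # down_proj layout
+ out = cpu_fused_moe(x, w1, w2, weights, ids, is_interleaved=False)
+ assert out.shape == (num_tokens, hidden)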
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/cpu_moe_cpp.py b/build/torch29-cxx11-cu130-aarch64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
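+
+# Illustrative call pattern (hypothetical shapes; requires the compiled _ops
+# extension). Note the layout difference from cpu_fused_moe: here w1 is
+# [E, 2N, K] and w2 is [E, K, N], topk_ids must be int32, and passing bf16
+# activations avoids the internal float32 -> bfloat16 round trip:
+#
+#   out = fused_moe_cpp(
+#       hidden_states=x.to(torch.bfloat16),   # [M, K]
+#       w1=w1, w2=w2,                         # VNNI-packed when is_vnni=True
+#       topk_weights=weights, topk_ids=ids.to(torch.int32),
+#       w1_bias=b1, w2_bias=b2,
+#       alpha=1.702, limit=7.0,               # enables the swigluoai path
+#       is_vnni=True,
+#   )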
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "MegaBlocksMoeMLP"]
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/grouped_gemm/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/grouped_gemm/backend.py b/build/torch29-cxx11-cu130-aarch64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
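+
+# Shape sketch (illustrative): with trans_a=False and trans_b=False, `a` holds
+# all tokens as [sum(batch_sizes), K], `b` stacks one [K, N] weight per group
+# as [num_groups, K, N], and the result is [sum(batch_sizes), N]. For example,
+# batch_sizes=[3, 5] with K=8 and N=16 yields an [8, 16] output.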
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/grouped_gemm/ops.py b/build/torch29-cxx11-cu130-aarch64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/grouped_gemm_util.py b/build/torch29-cxx11-cu130-aarch64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # grouped_gemm is vendored in this repository, so no external import is
+ # needed here.
+ # import grouped_gemm
+ _grouped_gemm_is_available = True
+except ImportError:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+ '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/layers.py b/build/torch29-cxx11-cu130-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: Optional[float] = None,
+ moe_normalize_expert_weights: Optional[int] = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
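+
+# Shape note (illustrative): mlp_forward expects tokens already grouped per
+# expert, i.e. x is [num_experts, expert_capacity, hidden_size], w1 is
+# [num_experts, hidden_size, 2 * ffn_size] with gate/up interleaved along the
+# last dimension, and w2 is [num_experts, ffn_size, hidden_size]; the output
+# has the same shape as x.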
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
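+
+# Worked example (illustrative): with moe_top_k=4 and
+# shared_expert_weighted_sum=True the shared expert contributes 1/5 of the
+# combined output and the routed experts 4/5; otherwise the two outputs are
+# simply added.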
+
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: Optional[torch.distributed.ProcessGroup],
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
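+
+# Worked example (illustrative): 1024 tokens, top_k=4, 128 experts, a capacity
+# factor of 1.0 and no expert parallelism give 4 * 1024 * 1 / 128 = 32 slots
+# per expert.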
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+ tokens, score_num_experts = expert_scores.size()
+ assert score_num_experts == num_experts
+ assert len(tokens_per_expert.size()) == 1
+ (count_num_experts,) = tokens_per_expert.size()
+ assert count_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
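+
+# Note: `indices` permutes tokens into expert-sorted order, `bin_ids` holds the
+# expert id at each sorted position, and `bins` are the inclusive cumulative
+# token counts that delimit each expert's segment.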
+
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Ensure CUB knows which device to use
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
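+
+ # Illustrative usage (hypothetical sizes): shared expert weights can be
+ # attached after construction, e.g.
+ #   mlp = MegaBlocksMoeMLPWithSharedExpert()
+ #   up_w, down_w, up_b, down_b = create_shared_expert_weights(
+ #       hidden_size=1024, shared_expert_hidden_size=4096,
+ #       device=torch.device("cuda"), dtype=torch.bfloat16,
+ #       init_method=torch.nn.init.normal_)
+ #   mlp.set_shared_expert_weights(up_w, down_w, up_b, down_b)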
+
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
+
+# Backend overrides: on XPU, replace MegaBlocksMoeMLP with the fused XPU
+# implementation; the C++ CPU implementation is always re-exported below.
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/megablocks/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: after adding it to `sys.modules`, it
+ # would also be used for other imports. So we derive a unique module name
+ # from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/metadata.json b/build/torch29-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a9813b81c6c98110d265c184f2016d728202289b
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/all_to_all_benchmark.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
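+    # Each rank exchanges an equal slice of the sequence with every other rank.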
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per element (half precision).
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/binned_gather.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/binned_scatter.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/cumsum.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block to give a better error message and
+# instructions for building the C++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
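+        # Handle 1D inputs by promoting them to a single row, running the cumsum
+        # along that row, and squeezing the result back.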
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/gather.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/histogram.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block to give a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/histogram_benchmark.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57b7bf8228e01237236748147368b09ffdf8072
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class HistogramBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testTorchHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/matmul_benchmark.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccc5dcec5e9a663794fad944c45285869c4d1c1
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+# import stk
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/ops/matmul_benchmark.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1), which in turn calls
+# torch.as_strided(...). Circumvent this chain to avoid the overhead
+# it adds.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+class MatmulBenchmark(parameterized.TestCase):
+
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+ blocking = 128
+ padded_tokens, _ = x.size()
+ assert padded_tokens % blocking == 0
+ assert fhs % blocking == 0
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // blocking
+ blocks_per_row = fhs // blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ blocking,
+ block_rows,
+ blocks_per_row,
+ )
+ data = torch.empty(
+ column_indices.numel(),
+ blocking,
+ blocking,
+ dtype=torch.float16,
+ device=x.device,
+ )
+ shape = (padded_tokens, fhs * ne)
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+
+ def build_input_matrix(self, sl, hs, ne):
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Assign tokens to experts uniformly.
+ top_expert = torch.arange(0, sl).cuda().int() % ne
+
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+ return out, padded_bins
+
+ def build_weight_matrix(self, ne, hs, fhs):
+ return torch.randn((hs, ne * fhs)).cuda().half()
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(x, w, topo)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(topo, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradX::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ topo = topo.t()
+
+ def benchmark():
+ return stk.ops.dsd(topo, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(out, w, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ x = x.t()
+
+ def benchmark():
+ return stk.ops.dsd(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+
+ w = w.transpose(1, 2).contiguous()
+ w = w.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd:DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = w.transpose(1, 2).contiguous()
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradX:DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ out = out.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(out, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradW:DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = torch.transpose(w, 1, 2)
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ x = torch.transpose(x, 1, 2)
+
+ def benchmark():
+ return torch.bmm(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/padded_gather.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/padded_scatter.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
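+        # Only stash the activations when a gradient w.r.t. the scatter weights is needed.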
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/padded_scatter_benchmark.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c575cfe7487d346ba9ec18bbb7ef17f2eb77ec51
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+class PaddedScatterTest(parameterized.TestCase):
+
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+ def testPaddedScatter(self, sl, hs, ne, top_k):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ # Sample weights for the scatter reduce.
+ weights = torch.rand((sl * top_k,)).cuda().half()
+
+ # Gather the data to prepare for backwards.
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ def benchmark():
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+ benchmark_util.log_benchmark(
+ 'Padded Scatter',
+ {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ 'top_k': top_k,
+ },
+ time,
+ std,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/permute_benchmark.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6536eeeae402659a087e5c51ef9840627af56501
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+class PermuteBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedGather(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+            return ops.binned_gather(x, indices, bins, ec, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedScatter(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.binned_gather(x, indices, bins, ec, 1)
+
+ def benchmark():
+            return ops.binned_scatter(x, indices, None, bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedGather(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedScatter(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ def benchmark():
+            return ops.padded_scatter(x, indices, bin_ids, None, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testCopy(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ # ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ y = x.clone()
+
+ def benchmark():
+ return y.copy_(x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/repeat.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/replicate.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block to give a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
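+        # Reduce the replicated gradient segments back to one value per bin.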
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/round_up.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+    # do this in a custom kernel. We only expect
+    # to use this on arrays of fewer than 1k elements.
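+    # Integer ceil-division trick: round each entry up to the nearest multiple of `value`.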
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/scatter.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/sort.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block to give a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
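+        # `end_bit` limits the sort to the low-order bits of the keys;
+        # default to the full width of the dtype.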
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/sort_benchmark.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff957d4c552c6e61d9279a7989795472af7b7
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class SortBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_SORT_TESTS)
+ def testSort(self, n, dtype, max_val):
+ if max_val is None:
+ max_val = np.iinfo(numpy_dtype(dtype)).max
+ end_bit = int(np.ceil(np.log2(max_val)))
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_BASELINE_SORT_TESTS)
+ def testTorchSort(self, n):
+ x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+ arguments = {
+ 'n': n,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/stk_autocast.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
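+    # Recursively cast eligible CUDA floating-point tensors (including those
+    # nested in dicts, lists, and tuples) to the autocast dtype.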
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/sum.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/ops/topology.py b/build/torch29-cxx11-cu130-aarch64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block to give a better error message and
+# instructions for building the C++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+# import stk.random
+# import stk.ops
+# from stk.matrix import Matrix
+
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/backend/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/backend/autocast.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/backend/sputnik.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
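+    # Treat a tensor as transposed when it is a column-major view: unit stride
+    # along dim 0 and a dim-1 stride equal to the number of rows.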
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/backend/triton_kernels.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+    error_string = "incompatible dimensions: dimension of length {} must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to sparse matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
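+    # dsd: dense output = block-sparse lhs (BCSR blocks in `data`) @ dense `rhs`.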
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+ # return out
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
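+    # dds: dense output = dense `lhs` @ block-sparse rhs (BCSR blocks in `data`).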
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
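+    # sdd: block-sparse output (pattern given by row/column_indices) = dense `lhs` @ dense `rhs`.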
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/matrix.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+ if data.dim() != 3:
+ raise ValueError(
+ "Expected 3D shape for data (nnz, block, block). "
+ f"Got shape {data.dim()}D shape.")
+
+ block_size = data.shape[1]
+ if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+ raise ValueError(
+ "Matrix shape must be dividible by blocking. "
+ f"Got shape {shape} with "
+ f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+    block_rows = np.prod(shape[:-1]) // block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
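+    # block_offsets_t[i] is the position in the original data array of the
+    # i-th block when blocks are traversed in transposed (column-major) order.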
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
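+    Illustrative layout: for a 256x256 matrix with 128x128 blocking in which
+    only blocks (0, 0) and (1, 1) are nonzero, data has shape [2, 128, 128],
+    row_indices = [0, 1], column_indices = [0, 1] and offsets = [0, 1, 2].
+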
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+ raise ValueError(
+ "Sparse matrix with shape {size} exceeds representable "
+ "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
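+
+    Example (illustrative):
+        c = stk.ops.mul(a, stk.ops.ones_like(a))  # c has the same stored blocks as a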
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops_test.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bfd4f6af77042d3c5bdb1fe18d00e457478d46
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+@parameterized.parameters(_ELTWISE_OP_TESTS)
+class EltwiseOpsTest(parameterized.TestCase):
+
+ def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+
+ a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+ b_dense, b = _dense_and_sparse_like(a)
+
+ out = stk.ops.mul(a, b)
+ expected_out = torch.mul(a_dense, b_dense)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size(), out.size())
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = a_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = b_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/linear_ops.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
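+    # `topo` provides the output sparsity pattern; the returned Matrix reuses
+    # its indices/offsets around the newly computed block data.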
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/linear_ops_test.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced1d782fbc9f9ca16b3449239f1588dc5ff5e00
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+@parameterized.parameters(*_LINEAR_OP_TESTS)
+class LinearOpsTest(parameterized.TestCase):
+
+ def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = _mask(a_dense.grad, a.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = _mask(b_dense.grad, b.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+ _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+ expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# can be implemented much more simply than it is here.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
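+    # Expands block-level (row, col) index pairs into element-level indices by
+    # replicating each pair blocking x blocking times and adding per-element
+    # offsets within the block.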
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops_test.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3af04c0760483e578f93303dc457415948a2a34c
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+from absl.testing import parameterized
+import stk
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class MatrixOpsTest(parameterized.TestCase):
+
+ def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ x = (torch.randn(rows, cols) * mask).type(torch.float16)
+
+ # Convert the matrix to sparse format.
+ sparse_x = stk.ops.to_sparse(x, blocking)
+
+ # Validate the matrix.
+ sparse_x.validate()
+
+ # Validate the shape.
+ self.assertEqual(sparse_x.dim(), 2)
+ self.assertEqual(sparse_x.size()[0], rows)
+ self.assertEqual(sparse_x.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(sparse_x.nnz, nnz)
+
+ # Convert back to dense format.
+ dense_x = stk.ops.to_dense(sparse_x)
+
+ # Validate the shape.
+ self.assertEqual(dense_x.dim(), 2)
+ self.assertEqual(dense_x.size()[0], rows)
+ self.assertEqual(dense_x.size()[1], cols)
+
+ # Validate the sparsity
+ self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+
+ # Validate the output.
+ self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/random/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/random/random_ops.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/stk/random/random_ops_test.py b/build/torch29-cxx11-cu130-aarch64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..587b44ec890c861879c6296b8f9028f5d99ab82f
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+from absl.testing import parameterized
+from .. import random
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class RandomOpsTest(parameterized.TestCase):
+
+ def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+ mask = random.dense_mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(
+ torch.count_nonzero(mask).item(),
+ nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask, 0),
+ torch.eq(mask, 1))))
+
+ def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+ mask = random.mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the matrix.
+ mask.validate()
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(mask.nnz, nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask.data, 0),
+ torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/xpu_fused_moe.py b/build/torch29-cxx11-cu130-aarch64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# default
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
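+    # Heuristic: return the smallest candidate block size for which the
+    # per-expert block counters (num_blocks_per_seq * num_experts_per_node)
+    # fit within a single block; otherwise fall back to 1024.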
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
+
+
+def implement_zp(qweight):
+ # change u4 to s4 to avoid zero point in gemm kernel
+ # only support default zero point now
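+    # Net effect (per packed byte, high nibble first): subtract 8 from each
+    # unsigned 4-bit value and re-pack the results as signed 4-bit
+    # two's-complement nibbles.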
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
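+    # Shape sketch (illustrative): with num_rows=4, hidden_size=1024,
+    # inter_size=2048 and topk=4, hidden_states is [4, 1024], w13 is
+    # [num_experts, 2*2048, 1024], topk_ids is [4, 4] and the returned
+    # output is [4, 1024].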
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ func. Temporarily exposed here before GEMM fusion.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
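+    # Each buffer below is padded up to a 256-byte boundary and packed
+    # back-to-back into one uint8 workspace; ws_map records
+    # (padded_size, byte_offset) for later slicing.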
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code)
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
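+
+    Example (illustrative): the exclusive cumsum of [1, 2, 3] along dim 0
+    is [0, 1, 3].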
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
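+
+    Example (illustrative): for expert ids [0, 1, 1, 3] and num_bins=4 the
+    result is [1, 2, 0, 1].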
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
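+
+    Example (illustrative): cumsum([1, 2, 3]) gives [1, 3, 6], while
+    cumsum([1, 2, 3], exclusive=True) gives [0, 1, 3].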
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
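+
+    Example (illustrative): argsort([3, 1, 2]) returns ([1, 2, 3], [1, 2, 0]).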
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
+
+
+# Export public API
+__all__ = [
+ "MyReplacementLayer",
+ # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/activation_fn.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/all_to_all.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
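+# Autograd-aware wrapper around dist.all_to_all_single. Illustrative usage
+# sketch: on each rank, input_split_sizes[i] rows of x are sent to rank i and
+# output_split_sizes[i] rows are received from rank i, so the returned tensor
+# has sum(output_split_sizes) rows (the handle is None unless async_op=True).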
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/arguments.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in shared expert (purpose: to allow using custom FC layer eg te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False # enable using weighted sum for shared expert output (weighted by the number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+ try:
+ import triton
+ if triton.__version__ >= '3.2.0':
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/common.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/dmlp_registry.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e., only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
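+
+    Example (illustrative): mlp_type='glu' with mlp_impl='grouped' yields a
+    GroupedGLU, while mlp_type='mlp' with mlp_impl='sparse' yields a SparseMLP.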
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/dmoe.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
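+        # Example (illustrative): with blocking=128, 256 padded tokens and
+        # ffn_hidden_size=512, block_rows=2, blocks_per_row=4 and
+        # offsets=[0, 4, 8].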
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ # position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy, # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capactiy,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/gelu.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/glu.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+ # GeLU.
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+ """GPU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/memory_test.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+ print('Max Memory Allocated = {:0.0f}MiB'.format(mem / 1e6))
+ print('Max Memory Reserved = {:0.0f}MiB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+ print('Weight Memory Allocated = {:0.0f}MiB'.format(weight_memory / 1e6))
+ print('Activation Memory Allocated = {:0.0f}MiB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+ print('Total Bytes Found = {:0.0f}MiB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/mlp.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+    def backward(ctx: Any, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
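+# scale_gradient(w, s) is the identity in the forward pass but multiplies the
+# incoming gradient by s in the backward pass; below it is used to divide
+# expert gradients by the expert-parallel world size.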
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
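+        # Example (illustrative): with moe_top_k=4 the shared expert output is
+        # weighted by 1/5 and the routed expert output by 4/5.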
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+            # weighted by the number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/moe.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f'Expected {num_layers_per_pipeline_stage} tokens_per_expert entries '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+ f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
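+
+# Worked example (illustrative numbers only, not part of the upstream API):
+# with moe_num_experts=8, moe_loss_weight=0.01, num_layers=4, tokens=1024 and
+# moe_top_k=2, the scale above is (8 * 0.01) / (4 * 1024 * 2) ≈ 9.77e-6, and
+# the returned loss is that scale times
+# torch.dot(tokens_per_expert, expert_scores).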
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
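+
+ # Worked example (illustrative numbers only): with top_k=2, tokens=1024,
+ # world_size=1, num_experts=8 and moe_capacity_factor=1.25,
+ # tokens_per_expert = 2 * 1024 * 1 / 8 = 256 and the capacity per expert
+ # is int(1.25 * 256) = 320 slots.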
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+ # expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/mpu.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
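+
+
+# Illustrative example (assumed numbers, not part of the module): with an
+# expert-parallel world size of 8 and moe_num_experts=4,
+# expert_sharding_degree() is 4 and hidden_sharding_degree() is 8 // 4 = 2,
+# so experts_per_rank() is 1 and each rank holds ffn_hidden_size // 2
+# features of the experts it shares with one other rank.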
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/router.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
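+
+# Example (illustrative): for a flat index tensor with 6 elements and
+# num_experts=3, _uniform_expert_assignment produces [0, 1, 2, 0, 1, 2]
+# (reshaped to the input shape), ignoring the learned router scores.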
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
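+
+ # Example (illustrative): with moe_jitter_eps=0.01 each element of x is
+ # scaled by an independent uniform factor drawn from [0.99, 1.01).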
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch29-cxx11-cu130-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_megablocks_cuda_6e04dec.abi3.so b/build/torch29-cxx11-cu130-x86_64-linux/_megablocks_cuda_6e04dec.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..3548f7fb815188fc523c90fa2111a7b14bf82e95
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_megablocks_cuda_6e04dec.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e1383adbf7afa208f0769d84a826fcd43de9ee9ce39d676ebce97698759c526
+size 12031416
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_ops.py b/build/torch29-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2f202b8db3c3f3028303ab4308cf35f950e2c74
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_cuda_6e04dec
+ops = torch.ops._megablocks_cuda_6e04dec
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_cuda_6e04dec::{op_name}"
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_version.py b/build/torch29-cxx11-cu130-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/backend/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/backend/kernels.py b/build/torch29-cxx11-cu130-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have CUDA.
+# This approach preserves the original code but enables testing without a GPU.
+if not torch.cuda.is_available():
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
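+#
+# Illustrative example (assumed counts): if two experts receive 3 and 5
+# tokens, bins = [3, 8] (inclusive cumulative counts). If each expert's slice
+# is padded up to a multiple of 4, padded_bins = [4, 12], so the
+# padded_gather output defined below has 12 rows.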
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has a greater or equal
+ # number of rows since its rows may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+ # One threadblock per row in 'a'. Array 'b' has a greater or equal
+ # number of rows since its rows may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+ # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/benchmark_util.py b/build/torch29-cxx11-cu130-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/cpu_fused_moe.py b/build/torch29-cxx11-cu130-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
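+
+
+# Minimal usage sketch for the two activations above (illustrative shapes
+# only):
+#
+#   gate, up = torch.randn(4, 16), torch.randn(4, 16)
+#   y1 = swigluoai_activation(gate, up)      # GptOss-style clamped SwiGLU
+#   y2 = silu_and_mul_activation(gate, up)   # standard SiLU(gate) * up
+#   # y1, y2: [4, 16]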
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
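+
+
+# Minimal usage sketch (illustrative shapes only):
+#
+#   x = torch.randn(2, 3, 64)            # [batch, seq, hidden]
+#   router_w = torch.randn(8, 64)        # [num_experts, hidden]
+#   logits, weights, ids = route_tokens_cpu(x, router_w, None, 2, 8)
+#   # logits: [6, 8]; weights, ids: [6, 2]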
+
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+ This implementation loops over experts, but batches all tokens routed to
+ each expert into a single matrix multiply, which is more efficient on CPU
+ than processing tokens one at a time.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+ # For each expert, find the (token_idx, topk_pos) pairs routed to it and
+ # process those tokens as one batch.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
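+
+
+# Minimal usage sketch for cpu_fused_moe (illustrative shapes, standard
+# non-interleaved [gate | up] layout):
+#
+#   tokens, hidden, inter, experts, topk = 6, 64, 128, 8, 2
+#   hs = torch.randn(tokens, hidden)
+#   w1 = torch.randn(experts, hidden, 2 * inter)
+#   w2 = torch.randn(experts, inter, hidden)
+#   weights = torch.rand(tokens, topk).softmax(dim=-1)
+#   ids = torch.randint(0, experts, (tokens, topk))
+#   out = cpu_fused_moe(hs, w1, w2, weights, ids,
+#                       activation="silu", is_interleaved=False)
+#   # out: [tokens, hidden]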
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py b/build/torch29-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "CPUMegaBlocksMoeMLP"]
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/grouped_gemm/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/grouped_gemm/backend.py b/build/torch29-cxx11-cu130-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/grouped_gemm/ops.py b/build/torch29-cxx11-cu130-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
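+
+# Example (illustrative only): two expert groups of 3 and 5 rows multiplied
+# against their own weight matrices in a single call. Names are placeholders.
+#
+#   a = torch.randn(8, 16, device="cuda", dtype=torch.bfloat16)      # rows grouped by expert
+#   b = torch.randn(2, 16, 32, device="cuda", dtype=torch.bfloat16)  # one [16, 32] matrix per group
+#   batch_sizes = torch.tensor([3, 5])                               # rows per group, kept on CPU
+#   out = gmm(a, b, batch_sizes)                                     # -> [8, 32]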
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/grouped_gemm_util.py b/build/torch29-cxx11-cu130-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+ # import grouped_gemm
+ pass
+ _grouped_gemm_is_available = True
+except ImportError as error:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+    msg = (
+        'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+    )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/layers.py b/build/torch29-cxx11-cu130-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
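+# Illustrative behaviour of the patches above: while torch.compile is tracing,
+# ops.sort(top_experts.int(), sort_end_bit) returns two empty placeholders with
+# the input's shape and dtype; in eager mode the call still dispatches to the
+# real CUDA kernel.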
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
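+# Worked example (illustrative): with world_size=4, moe_num_experts=2 and
+# ffn_hidden_size=3072, expert_sharding_degree is 2, hidden_sharding_degree is 2,
+# experts_per_rank is 1 and features_per_rank is 1536.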
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
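+# Example (illustrative): routing 6 tokens of width 16 across 8 experts with
+# moe_top_k=2 returns logits of shape [6, 8] plus expert_weights and
+# expert_indices of shape [6, 2]; the weights are softmax-normalized over the
+# top-k dimension.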
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
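+
+# The fused activation above is the clamped SwiGLU variant used by GPT-OSS-style
+# experts. Gate and up are the even/odd columns of the fused gate_up projection;
+# with gate_c = min(gate, limit) and up_c = clamp(up, -limit, limit), each expert
+# computes ((up_c + 1) * gate_c * sigmoid(alpha * gate_c)) @ w2 + w2_bias.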
+
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
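+# Worked example (illustrative): with moe_top_k=4 and
+# shared_expert_weighted_sum=True the shared output is scaled by 1/5 and the
+# routed-expert output by 4/5; with the flag unset the two are simply added.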
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: int,
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+    assert len(expert_scores.size()) == 2
+    tokens, scores_num_experts = expert_scores.size()
+    assert scores_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (counts_num_experts,) = tokens_per_expert.size()
+    assert counts_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
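+# Worked example (illustrative): for num_experts=4, tokens=8 and top_k=2,
+# perfectly balanced routing gives tokens_per_expert=[4, 4, 4, 4] and mean
+# expert_scores of 0.25 each, so the loss is (4 / 16) * dot([4]*4, [0.25]*4) = 1.0.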
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
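+# Worked example (illustrative): top_expert=[2, 0, 1, 0] with num_experts=3
+# yields tokens_per_expert=[2, 1, 1], bins=[2, 3, 4], bin_ids=[0, 0, 1, 2] and
+# indices=[1, 3, 2, 0] (token positions reordered so experts are contiguous).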
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
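+# Worked example (illustrative): tokens=1024, top_k=4 and num_experts=128 on a
+# single rank with moe_capacity_factor=1.0 gives 4 * 1024 / 128 = 32 slots per expert.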
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: int = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+        # Kick off an async all_to_all to exchange the token counts across ranks
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
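+# Example (illustrative only): attaching a shared expert once the layer's
+# router and experts have been set up elsewhere. The initializer below is a
+# placeholder; any in-place init function works.
+#
+#   layer = MegaBlocksMoeMLPWithSharedExpert()
+#   up_w, down_w, up_b, down_b = create_shared_expert_weights(
+#       hidden_size=1152, shared_expert_hidden_size=3072,
+#       device=torch.device("cuda"), dtype=torch.bfloat16,
+#       init_method=torch.nn.init.xavier_uniform_)
+#   layer.set_shared_expert_weights(up_w, down_w, up_b, down_b, weighted_sum=False)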
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/megablocks/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/metadata.json b/build/torch29-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a9813b81c6c98110d265c184f2016d728202289b
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "7.5",
+ "8.0",
+ "8.6",
+ "8.7",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+ 'message_size (B)': send_recv_sizes[0] * hs * 2, # 2B elements.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+    dist.init_process_group(backend='nccl')
+    group = dist.group.WORLD
+    local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/binned_gather.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/binned_scatter.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ assert len(x.size()) == 3
+ ctx.bin_size = x.size(1)
+ ctx.top_k = top_k
+
+ # TODO(tgale): Don't save 'x' for backwards if we don't need to
+ # calculate the gradient w.r.t. 'weights'.
+ ctx.save_for_backward(x, indices, weights, bins)
+ return kernels.binned_scatter(x, indices, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ x, indices, weights, bins = ctx.saved_tensors
+ out = kernels.binned_gather(
+ grad,
+ indices,
+ weights,
+ bins,
+ ctx.bin_size,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[2]:
+ wgrad = kernels.binned_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bins,
+ ctx.top_k,
+ )
+ return out, None, wgrad, None, None
+
+
+binned_scatter = BinnedScatterOp.apply
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/cumsum.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7572391e20045d335cf7337246e8a9b9f57ef
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/cumsum.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ # import megablocks_ops as ops # type: ignore
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+    raise ModuleNotFoundError("Could not import the megablocks C++ ops extension (megablocks._ops).") from e
+
+
+# Autograd wrappers for cumsum kernels.
+# NOTE: Does not support gradients.
+class ExclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int):
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.exclusive_cumsum(x, dim, out)
+ return out
+
+
+exclusive_cumsum = ExclusiveCumsumOp.apply
+
+
+class InclusiveCumsumOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, dim: int) -> torch.Tensor:
+ if len(x.size()) == 1:
+ x = x.view([1, -1])
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, 1, out)
+ return out.squeeze()
+ out = torch.empty_like(x)
+ ops.inclusive_cumsum(x, dim, out)
+ return out
+
+
+inclusive_cumsum = InclusiveCumsumOp.apply
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/gather.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f87c1e7bed8d3589dd790805234976e0b05898
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/gather.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for gather kernel.
+class GatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins)
+ ctx.top_k = top_k
+ return kernels.gather(x, indices, bin_ids, None, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins = ctx.saved_tensors
+ out = kernels.scatter(grad, indices, bin_ids, None, bins, ctx.top_k)
+ return out, None, None, None, None, None
+
+
+gather = GatherOp.apply
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/histogram.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/histogram.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f058ec373cbba7555704fb5e4212c3cc75d9d
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/histogram.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# Wrap this in a try-block with better error message and
+# instructions for building the c++ operations.
+try:
+ from .._ops import ops # type: ignore
+except ModuleNotFoundError as e:
+    raise ModuleNotFoundError("Could not import the megablocks C++ ops extension (megablocks._ops).") from e
+
+
+# Autograd wrapper for histogram kernel.
+# NOTE: Does not support gradients.
+class HistogramOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, max_val: float):
+ return ops.histogram(x, max_val)
+
+
+histogram = HistogramOp.apply
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/histogram_benchmark.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/histogram_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57b7bf8228e01237236748147368b09ffdf8072
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/histogram_benchmark.py
@@ -0,0 +1,78 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_HISTOGRAM_TESTS = (
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 4),
+ (16384, torch.int32, 8),
+ (16384, torch.int32, 16),
+ (16384, torch.int32, 32),
+ (16384, torch.int32, 64),
+ (16384, torch.int32, 128),
+ (16384, torch.int32, 256),
+)
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class HistogramBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testHistogram(self, n, dtype, max_val):
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_HISTOGRAM_TESTS)
+ def testTorchHistogram(self, n, dtype, max_val):
+        x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/matmul_benchmark.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/matmul_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ccc5dcec5e9a663794fad944c45285869c4d1c1
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/matmul_benchmark.py
@@ -0,0 +1,415 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+from .. import stk
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+
+# Calling tensor.t() calls tensor.transpose(0, 1), which in turn calls
+# torch.as_strided(...). Build the transposed view directly to avoid the
+# overhead of that call chain.
+def transpose_view(x):
+ return torch.as_strided(
+ x,
+ (x.shape[1], x.shape[0]),
+ (x.stride()[1], x.stride()[0]),
+ )
+
+
+_MATMUL_TESTS = (
+ (64 * 1024, 512, 2048, 64),
+ (32 * 1024, 768, 3072, 64),
+ (8 * 1024, 1024, 4096, 64),
+ (4 * 2048, 4096, 4 * 4096, 4),
+)
+
+
+def log_benchmark(name, arguments, time, std, flops):
+ benchmark_util.log_benchmark(name, arguments, time, std)
+ print('flops = {:.2f}B'.format(flops / 1e9))
+ print('throughput = {:.2f}T'.format(flops / 1e9 / time))
+ print('=' * 60)
+
+
+class MatmulBenchmark(parameterized.TestCase):
+
+ def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+ blocking = 128
+ padded_tokens, _ = x.size()
+ assert padded_tokens % blocking == 0
+ assert fhs % blocking == 0
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // blocking
+ blocks_per_row = fhs // blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ blocking,
+ block_rows,
+ blocks_per_row,
+ )
+ data = torch.empty(
+ column_indices.numel(),
+ blocking,
+ blocking,
+ dtype=torch.float16,
+ device=x.device,
+ )
+ shape = (padded_tokens, fhs * ne)
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+
+ def build_input_matrix(self, sl, hs, ne):
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Assign tokens to experts uniformly.
+ top_expert = torch.arange(0, sl).cuda().int() % ne
+
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+ return out, padded_bins
+
+ def build_weight_matrix(self, ne, hs, fhs):
+ return torch.randn((hs, ne * fhs)).cuda().half()
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(x, w, topo)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(topo, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradX::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ topo = topo.t()
+
+ def benchmark():
+ return stk.ops.dsd(topo, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+
+ def benchmark():
+ return stk.ops.dsd(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DSD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ w = transpose_view(w)
+
+ def benchmark():
+ return stk.ops.sdd(out, w, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::SDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+ x, padded_bins = self.build_input_matrix(sl, hs, ne)
+ w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+ x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+ out = stk.ops.dsd(x, w)
+ x = x.t()
+
+ def benchmark():
+ return stk.ops.dsd(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DSD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.nnz * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+
+ w = w.transpose(1, 2).contiguous()
+ w = w.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0::Fwd:DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = w.transpose(1, 2).contiguous()
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradX:DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, hs)).cuda().half()
+ w = torch.randn((ne, hs, fhs)).cuda().half()
+ out = torch.bmm(x, w)
+ out = out.transpose(1, 2)
+
+ def benchmark():
+ return torch.bmm(out, x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '0:GradW:DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * fhs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+
+ def benchmark():
+ return torch.bmm(x, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::Fwd::DDD::NN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ w = torch.transpose(w, 1, 2)
+
+ def benchmark():
+ return torch.bmm(out, w)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradX::DDD::NT',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+ @parameterized.parameters(*_MATMUL_TESTS)
+ def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+ assert (sl % ne) == 0
+ x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+ w = torch.randn((ne, fhs, hs)).cuda().half()
+ out = torch.bmm(x, w)
+ x = torch.transpose(x, 1, 2)
+
+ def benchmark():
+ return torch.bmm(x, out)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'ffn_hidden_size': fhs,
+ 'num_experts': ne,
+ }
+ log_benchmark(
+ '1::GradW::DDD::TN',
+ arguments,
+ mean_t,
+ std_t,
+ x.numel() * hs * 2,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/padded_gather.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/padded_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cf4047c9494394d2a3884ba8830179013db7ff
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/padded_gather.py
@@ -0,0 +1,55 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_gather kernel.
+class PaddedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bin_ids, bins, padded_bins)
+ ctx.top_k = top_k
+ return kernels.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+
+ indices, bin_ids, bins, padded_bins = ctx.saved_tensors
+ out = kernels.padded_scatter(
+ grad,
+ indices,
+ bin_ids,
+ None,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return out, None, None, None, None, None
+
+
+padded_gather = PaddedGatherOp.apply
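+
+
+# Illustrative usage (assumed names, mirrors the benchmarks in this repo):
+#   padded_x = padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+# gathers tokens into expert order while padding each expert's bin out to the
+# block-aligned sizes encoded in padded_bins (see ops.round_up).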
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/padded_scatter.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/padded_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e021b81497e472cda5d72bdac557a0ca92d262
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/padded_scatter.py
@@ -0,0 +1,98 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for padded_scatter kernel.
+class PaddedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+ ):
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ *maybe_x,
+ )
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins, padded_bins = saved_tensors[:5]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.padded_gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.padded_scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None, None
+
+
+def padded_scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ padded_bins: torch.Tensor,
+ top_k: int,
+):
+ return PaddedScatterOp.apply(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
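+
+
+# Illustrative usage (assumed names, mirrors padded_scatter_benchmark.py):
+#   y = padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k)
+# un-permutes the padded, expert-ordered activations back to token order,
+# applying the per-token router weights during the reduction.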
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/padded_scatter_benchmark.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/padded_scatter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c575cfe7487d346ba9ec18bbb7ef17f2eb77ec51
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/padded_scatter_benchmark.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PADDED_SCATTER_BENCHMARK = (
+ # dMoE-Medium, 8-way EMP.
+ (1024 * 16, 1024, 8, 4),
+ # dMoE-Medium, post-all-to-all.
+ (1024 * 16 * 4, 1024, 8, 1),
+)
+
+
+class PaddedScatterTest(parameterized.TestCase):
+
+ @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+ def testPaddedScatter(self, sl, hs, ne, top_k):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ # Sample weights for the scatter reduce.
+ weights = torch.rand((sl * top_k,)).cuda().half()
+
+ # Gather the data to prepare for backwards.
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ def benchmark():
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+ benchmark_util.log_benchmark(
+ 'Padded Scatter',
+ {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ 'top_k': top_k,
+ },
+ time,
+ std,
+ )
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/permute_benchmark.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/permute_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..6536eeeae402659a087e5c51ef9840627af56501
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/permute_benchmark.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import torch
+from absl.testing import parameterized
+
+from .. import benchmark_util, ops
+
+_PERMUTE_TESTS = (
+ (16384, 768, 2),
+ (16384, 768, 4),
+ (16384, 768, 8),
+ (16384, 768, 16),
+ (16384, 768, 32),
+ (16384, 768, 64),
+ (16384, 768, 128),
+ (16384 * 8, 768, 2),
+ (16384 * 8, 768, 4),
+ (16384 * 8, 768, 8),
+ (16384 * 8, 768, 16),
+ (16384 * 8, 768, 32),
+ (16384 * 8, 768, 64),
+ (16384 * 8, 768, 128),
+)
+
+
+class PermuteBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedGather(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+ return ops.binned_gather(x, indices, bins, ec)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testBinnedScatter(self, sl, hs, ne):
+ # NOTE: Capacity factor == 1.
+ ec = sl // ne
+
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+        tokens_per_expert = ops.histogram(top_expert, ne)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ x = ops.binned_gather(x, indices, bins, ec)
+
+ def benchmark():
+ return ops.binned_scatter(x, indices, bins)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedGather(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ def benchmark():
+            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testPaddedScatter(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+
+ # Randomly assign tokens to experts.
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+ bin_ids, indices = ops.sort(top_expert)
+ tokens_per_expert = ops.histogram(top_expert, ne)
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+
+ def benchmark():
+            # No router weights and top_k == 1 for this benchmark.
+            return ops.padded_scatter(x, indices, bin_ids, None, bins, padded_bins, 1)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_PERMUTE_TESTS)
+ def testCopy(self, sl, hs, ne):
+ # Create the data and indices.
+ x = torch.randn((sl, hs)).cuda().half()
+ y = x.clone()
+
+ def benchmark():
+ return y.copy_(x)
+
+ mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+ arguments = {
+ 'sequence_length': sl,
+ 'hidden_size': hs,
+ 'num_experts': ne,
+ }
+ benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/repeat.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/repeat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9e09de5f857d51cd758ab30b2f3a846d6f9275
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/repeat.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def repeat(x: torch.Tensor, tiling: torch.Size):
+ if all((t == 1 for t in tiling)):
+ return x
+ return x.repeat(*tiling)
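+
+
+# Example (illustrative only): repeat(x, torch.Size((1, 1))) returns x
+# unchanged, while repeat(x, torch.Size((2, 1))) tiles x twice along dim 0.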
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/replicate.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..26daf0eede330603a4b8ea7167faf1411d07ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/replicate.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# The compiled C++ extension is wrapped in a try-block so that a missing
+# build produces a clearer error message for the custom operations.
+try:
+    from .._ops import ops  # type: ignore
+except ModuleNotFoundError as e:
+    raise ModuleNotFoundError(
+        "megablocks C++ ops ('_ops') not found; build the extension before importing this module.",
+    ) from e
+
+
+# Autograd wrapper for replicate kernel.
+class ReplicateOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+ ctx.save_for_backward(bins)
+ out = torch.empty((x.shape[0], num_outputs), dtype=x.dtype, device=x.device)
+ ops.replicate_forward(x, bins, out)
+ return out
+
+ @staticmethod
+ def backward(ctx: Any, grad: torch.Tensor):
+ bins, = ctx.saved_tensors
+ out = torch.empty((grad.shape[0], bins.shape[0]), dtype=grad.dtype, device=grad.device)
+ ops.replicate_backward(grad, bins, out)
+ return out, None, None
+
+
+replicate = ReplicateOp.apply
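+
+
+# Illustrative usage (assumed names, not executed here): replicate expands one
+# value per (row, bin) pair into one value per output column, e.g. broadcasting
+# a per-expert scale across all tokens assigned to that expert.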
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/round_up.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/round_up.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6bc873c9f448c5fa9126ebcfd66e8688002af
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/round_up.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+
+def round_up(x: torch.Tensor, value: int):
+ assert isinstance(value, int)
+ assert x.dtype == torch.int32
+
+    # TODO(tgale): If this becomes an issue,
+    # do this in a custom kernel. We only expect
+    # to use this on arrays of fewer than 1k elements.
+ return torch.div(x + (value - 1), value, rounding_mode='trunc') * value
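+
+
+# Example (illustrative only): with value=128, an int32 tensor [1, 128, 129]
+# rounds up to [128, 128, 256]; this is how per-expert token counts are padded
+# to the sparse block size before building padded bins.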
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/scatter.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4605d9b46f387761b070352365f223dbfe69d47
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/scatter.py
@@ -0,0 +1,72 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for scatter kernel.
+class ScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+ ) -> torch.Tensor:
+ maybe_x = [x] if ctx.needs_input_grad[3] else []
+ ctx.save_for_backward(indices, bin_ids, weights, bins, *maybe_x)
+ ctx.top_k = top_k
+ ctx.x_shape = x.shape
+ return kernels.scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ saved_tensors = ctx.saved_tensors
+
+ indices, bin_ids, weights, bins = saved_tensors[:4]
+ dgrad = None
+ if ctx.needs_input_grad[0]:
+ dgrad = kernels.gather(
+ grad,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ ctx.top_k,
+ )
+
+ wgrad = None
+ if ctx.needs_input_grad[3]: # need wgrad
+ x = saved_tensors[-1]
+ wgrad = kernels.scatter_wgrad(
+ x,
+ grad,
+ indices,
+ bin_ids,
+ bins,
+ ctx.top_k,
+ )
+ return dgrad, None, None, wgrad, None, None, None
+
+
+def scatter(
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor,
+ weights: torch.Tensor,
+ bins: torch.Tensor,
+ top_k: int,
+) -> Optional[torch.Tensor]:
+ return ScatterOp.apply(x, indices, bin_ids, weights, bins, top_k)
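+
+
+# Illustrative usage (assumed names, not executed here):
+#   y = scatter(x, indices, bin_ids, weights, bins, top_k)
+# is the un-padded counterpart of padded_scatter: it maps expert-ordered rows
+# of x back to token order, weighting each copy by the router weights.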
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/sort.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda3bf64283e39533c2eae3627e76bb2d0262c9f
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/sort.py
@@ -0,0 +1,38 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional, Tuple
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# The compiled C++ extension is wrapped in a try-block so that a missing
+# build produces a clearer error message for the custom operations.
+try:
+    from .._ops import ops  # type: ignore
+except ModuleNotFoundError as e:
+    raise ModuleNotFoundError(
+        "megablocks C++ ops ('_ops') not found; build the extension before importing this module.",
+    ) from e
+
+_BITS_FOR_DTYPE = {
+ torch.int16: 16,
+ torch.int32: 32,
+ torch.int64: 64,
+}
+
+
+# Autograd wrapper for sort kernel.
+# NOTE: Does not support gradients.
+class SortOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, end_bit: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+ if end_bit is None:
+ end_bit = _BITS_FOR_DTYPE[x.dtype]
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ ops.sort(x, end_bit, x_out, iota_out)
+ return (x_out, iota_out)
+
+
+sort = SortOp.apply
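+
+
+# Illustrative usage (assumed names, mirrors the benchmarks in this repo):
+#   bin_ids, indices = sort(top_expert)
+# returns the expert assignments in ascending order together with the
+# permutation that produced them; end_bit can be lowered when the values are
+# known to be small (e.g. ceil(log2(num_experts))).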
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/sort_benchmark.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/sort_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff957d4c552c6e61d9279a7989795472af7b7
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/sort_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+from .. import ops
+
+_SORT_TESTS = (
+ (16384, torch.int32, None),
+ (16384, torch.int32, 2),
+ (16384, torch.int32, 128),
+)
+
+_BASELINE_SORT_TESTS = ((16384,),)
+
+
+def numpy_dtype(dtype):
+ types = {
+ torch.int16: np.int16,
+ torch.int32: np.int32,
+ torch.int64: np.int64,
+ }
+ return types[dtype]
+
+
+def benchmark_function(fn, iterations=10):
+ # Run once to get rid of startup overhead.
+ fn()
+ times = []
+ for _ in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ times = np.array(times)
+ return times.mean(), times.std(), times.max(), times.min()
+
+
+def log_benchmark(arguments, mean_t, std_t):
+ print('=' * 60)
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean / std = {:.2f}ms / {:.2f}ms'.format(mean_t, std_t))
+ print('=' * 60)
+
+
+class SortBenchmark(parameterized.TestCase):
+
+ @parameterized.parameters(*_SORT_TESTS)
+ def testSort(self, n, dtype, max_val):
+ if max_val is None:
+ max_val = np.iinfo(numpy_dtype(dtype)).max
+ end_bit = int(np.ceil(np.log2(max_val)))
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+ arguments = {
+ 'n': n,
+ 'dtype': dtype,
+ 'max_val': max_val,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+ @parameterized.parameters(*_BASELINE_SORT_TESTS)
+ def testTorchSort(self, n):
+ x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+
+ mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+ arguments = {
+ 'n': n,
+ }
+ log_benchmark(arguments, mean_t, std_t)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/stk_autocast.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/stk_autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a3626e5e0eec51339c95a448bca84be14a2ca93
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/stk_autocast.py
@@ -0,0 +1,39 @@
+# vendored from
+# https://github.com/stanford-futuredata/stk/blob/736313768ef697ce13a0594a41b2512a0fbc9884/stk/backend/autocast.py
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/sum.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/sum.py
new file mode 100644
index 0000000000000000000000000000000000000000..e00c1aa68e1193f5b72f75a2edc37de8d505facc
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/sum.py
@@ -0,0 +1,9 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+
+
+def sum(x: torch.Tensor, dim: int = 0):
+ if x.shape[dim] == 1:
+ return x.squeeze(dim=dim)
+ return x.sum(dim=dim)
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/ops/topology.py b/build/torch29-cxx11-cu130-x86_64-linux/ops/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a50d3164db20534b099dcb4d8487a7aef25d15
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/ops/topology.py
@@ -0,0 +1,45 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# The compiled C++ extension is wrapped in a try-block so that a missing
+# build produces a clearer error message for the custom operations.
+try:
+    from .._ops import ops  # type: ignore
+except ModuleNotFoundError as e:
+    raise ModuleNotFoundError(
+        "megablocks C++ ops ('_ops') not found; build the extension before importing this module.",
+    ) from e
+
+
+# Autograd wrapper for topology kernel.
+# NOTE: Does not support gradients.
+class TopologyOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(
+ ctx: Any,
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+ ):
+ out = torch.empty(
+ output_block_rows * output_block_columns,
+ dtype=torch.int16,
+ device=padded_bins.device,
+ )
+ ops.indices(
+ padded_bins,
+ block_size,
+ output_block_rows,
+ output_block_columns,
+ out,
+ )
+ return out
+
+
+topology = TopologyOp.apply
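+
+
+# Illustrative usage (assumed names, mirrors matmul_benchmark.py in this repo):
+#   column_indices = topology(padded_bins, blocking, block_rows, blocks_per_row)
+# produces the int16 column indices of the nonzero blocks in the block-sparse
+# (BCSR) topology used for the expert computation.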
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73c40c267b1c3f4949e9c957a5d2c9f682dfc1a6
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/__init__.py
@@ -0,0 +1,7 @@
+from . import random
+from . import ops
+from .matrix import Matrix
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/backend/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/backend/autocast.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/backend/autocast.py
new file mode 100644
index 0000000000000000000000000000000000000000..97f6e919a60f3fd579ed0215031008d14111dc96
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/backend/autocast.py
@@ -0,0 +1,37 @@
+import functools
+import torch
+
+
+def _is_eligible(x):
+ return x.is_floating_point() and x.is_cuda and (x.dtype is not torch.float64)
+
+
+def _cast(x, dtype):
+ if isinstance(x, torch.Tensor) and _is_eligible(x):
+ return x.to(dtype)
+    elif isinstance(x, dict):
+ return {_cast(k, dtype): _cast(v, dtype) for k, v in x.items()}
+ elif isinstance(x, list) or isinstance(x, tuple):
+ return type(x)(map(lambda y: _cast(y, dtype), x))
+ return x
+
+
+def custom_fwd(fwd):
+ """Wrap a custom autograd function that always uses autocast dtype."""
+
+ @functools.wraps(fwd)
+ def decorate_fwd(*args, **kwargs):
+ if torch.is_autocast_enabled():
+ with torch.autocast(device_type="cuda", enabled=False):
+ dtype = torch.get_autocast_gpu_dtype()
+ return fwd(*_cast(args, dtype), **_cast(kwargs, dtype))
+ return fwd(*args, **kwargs)
+ return decorate_fwd
+
+
+def custom_bwd(bwd):
+ @functools.wraps(bwd)
+ def decorate_bwd(*args, **kwargs):
+ with torch.autocast(device_type="cuda", enabled=False):
+ return bwd(*args, **kwargs)
+ return decorate_bwd
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/backend/sputnik.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/backend/sputnik.py
new file mode 100644
index 0000000000000000000000000000000000000000..220c947bc1e932e8c77cc30f4069e9930f1aa962
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/backend/sputnik.py
@@ -0,0 +1,316 @@
+import torch
+
+from ..backend import triton_kernels as backend
+from ..backend.autocast import custom_bwd, custom_fwd
+
+
+def _standardize_shape(x, transpose):
+ if transpose:
+ return torch.Size((x[1], x[0]))
+ return x
+
+
+def _sparse_transpose(x):
+ return (torch.Size((x[0][1], x[0][0])), ) + x[1:]
+
+
+def _transpose_helper(x, transpose):
+ if isinstance(x, torch.Tensor):
+ return x.t() if transpose else x
+ if transpose:
+ x = _sparse_transpose(x)
+ return x + (transpose,)
+
+
+def _wrap(x):
+ if isinstance(x, torch.Tensor):
+ return (x,)
+ return x
+
+
+def _is_transposed(x):
+ return (not x.is_contiguous() and
+ x.stride()[0] == 1 and
+ x.stride()[1] == x.size()[0])
+
+
+def _call_helper(op, out, a, b, trans_a, trans_b):
+ args = (_wrap(_transpose_helper(a, trans_a)) +
+ _wrap(_transpose_helper(b, trans_b)))
+ if isinstance(out, tuple):
+ args = args + out
+ return op(*args)
+
+
+def _preprocess_inputs(lhs, rhs, dy):
+ if isinstance(lhs, torch.Tensor) and _is_transposed(lhs):
+ lhs = lhs.t()
+ if isinstance(rhs, torch.Tensor) and _is_transposed(rhs):
+ rhs = rhs.t()
+ if (isinstance(dy, torch.Tensor) and
+ not dy.is_contiguous() and
+ not _is_transposed(dy)):
+ dy = dy.contiguous()
+ if isinstance(dy, tuple) and not dy[1].is_contiguous():
+ dy = (dy[0], dy[1].contiguous()) + dy[2:]
+ return lhs, rhs, dy
+
+
+def _postprocess_outputs(x, transpose, grad):
+ if isinstance(x, torch.Tensor) and transpose:
+ return grad.t()
+ return grad
+
+
+def _lhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (rhs, dy) if trans_lhs else (dy, rhs)
+ trans_a = trans_lhs and trans_rhs
+ trans_b = trans_lhs or not trans_rhs
+ out = _call_helper(op, lhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(lhs, trans_lhs, out)
+
+
+def _rhs_gradient(op, lhs, rhs, dy, trans_lhs, trans_rhs):
+ lhs, rhs, dy = _preprocess_inputs(lhs, rhs, dy)
+
+ a, b = (dy, lhs) if trans_rhs else (lhs, dy)
+ trans_a = not trans_lhs or trans_rhs
+ trans_b = trans_lhs and trans_rhs
+ out = _call_helper(op, rhs, a, b, trans_a, trans_b)
+ return _postprocess_outputs(rhs, trans_rhs, out)
+
+
+class DSD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs):
+ ctx.save_for_backward(data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ rhs)
+ ctx.shape = _standardize_shape(shape, transpose_a)
+ ctx.transpose_a = transpose_a
+
+ out = torch.empty(
+ (shape[0], rhs.size()[1]),
+ dtype=rhs.dtype,
+ device=rhs.device)
+
+ backend.dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = (ctx.shape,) + saved_tensors[:-1]
+ rhs = saved_tensors[-1]
+ trans_a = ctx.transpose_a
+ trans_b = _is_transposed(rhs)
+
+ ddata = None
+ if ctx.needs_input_grad[1]:
+ ddata = _lhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[-1]:
+ op = dds if trans_b else dsd
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return None, ddata, None, None, None, None, None, None, None, drhs
+
+
+dsd = DSD.apply
+
+
+class DDS(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b):
+ ctx.save_for_backward(lhs,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = _standardize_shape(shape, transpose_b)
+ ctx.transpose_b = transpose_b
+ out = torch.empty((lhs.size()[0], shape[1]),
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs = saved_tensors[0]
+ rhs = (ctx.shape,) + saved_tensors[1:]
+ trans_a = _is_transposed(lhs)
+ trans_b = ctx.transpose_b
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dsd if trans_a else dds
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ ddata = None
+ if ctx.needs_input_grad[2]:
+ ddata = _rhs_gradient(sdd,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, None, ddata, None, None, None, None, None, None, None
+
+
+dds = DDS.apply
+
+
+class SDD(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(ctx,
+ lhs,
+ rhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t):
+ ctx.save_for_backward(
+ lhs,
+ rhs,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t)
+ ctx.shape = shape
+ out = torch.empty(
+ data.shape,
+ dtype=lhs.dtype,
+ device=lhs.device)
+ backend.sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices)
+ return out
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx, dy):
+ saved_tensors = ctx.saved_tensors
+ lhs, rhs = saved_tensors[:2]
+ dy = (ctx.shape, dy) + saved_tensors[2:]
+ trans_a = _is_transposed(lhs)
+ trans_b = _is_transposed(rhs)
+
+ dlhs = None
+ if ctx.needs_input_grad[0]:
+ op = dds if trans_a else dsd
+ dlhs = _lhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ drhs = None
+ if ctx.needs_input_grad[1]:
+ op = dsd if trans_b else dds
+ drhs = _rhs_gradient(op,
+ lhs,
+ rhs,
+ dy,
+ trans_a,
+ trans_b)
+ return dlhs, drhs, None, None, None, None, None, None, None, None
+
+
+sdd = SDD.apply
+
+class RowIndices(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, shape, data, offsets, column_indices):
+ out = torch.empty(
+ column_indices.shape,
+ dtype=column_indices.dtype,
+ device=column_indices.device)
+ backend.row_indices(shape, data, offsets, column_indices, out)
+ return out
+
+
+row_indices = RowIndices.apply
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/backend/triton_kernels.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/backend/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..c535309f3321249f475367164a558f94a4f8eb86
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/backend/triton_kernels.py
@@ -0,0 +1,393 @@
+import torch
+import triton
+import triton.language as tl
+from dataclasses import dataclass
+
+@dataclass
+class TritonConfig:
+ BLOCK_M: int = 128
+ BLOCK_N: int = 128
+ BLOCK_K: int = 32
+ BLOCK_SIZE: int = 128
+ NUM_STAGES: int = 4
+ NUM_WARPS: int = 4
+
+def _validate_matmul_dims(M: int, K: int, N: int):
+ error_string = "incompatible dimensions: tensor has dim with length: {}, which must be divisible by {}"
+ assert M % TritonConfig.BLOCK_M == 0, error_string.format(M, TritonConfig.BLOCK_M)
+ assert K % TritonConfig.BLOCK_K == 0, error_string.format(K, TritonConfig.BLOCK_K)
+ assert N % TritonConfig.BLOCK_N == 0, error_string.format(N, TritonConfig.BLOCK_N)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _sdd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+ # matrix multiplication
+ pid = tl.program_id(0)
+ pid_m = tl.load(row_indices + pid)
+ pid_n = tl.load(column_indices + pid)
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+ rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+ rk = tl.arange(0, BLOCK_K)
+ # pointers
+ A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+ B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
+ a = tl.load(A)
+ b = tl.load(B)
+ acc += tl.dot(a, b)
+ A += BLOCK_K * stride_ak
+ B += BLOCK_K * stride_bk
+    # Store to sparse matrix.
+ acc = acc.to(C.dtype.element_ty)
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ cm = tl.arange(0, BLOCK_M)
+ cn = tl.arange(0, BLOCK_N)
+ C = C + pid * BLOCK_ELEMENTS + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dsd_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_m)
+ end_inx = tl.load(offsets + pid_m + 1)
+
+ # pointers to sparse matrix
+ rm = tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to dense matrix
+ rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+ ak_sub_incr = BLOCK_K * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+ bk_block_incr = BLOCK_SIZE * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_A:
+ ptr_A = A + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+ else:
+ ptr_A = A + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * ak_sub_incr
+
+ ptr_B = B + tl.load(column_indices + start_inx + block_inx) * bk_block_incr + sub_block_inx * bk_sub_incr
+
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+@triton.autotune(
+ configs=[
+ # basic configs for compute-bound matmuls
+ triton.Config({
+ 'BLOCK_M': TritonConfig.BLOCK_M,
+ 'BLOCK_N': TritonConfig.BLOCK_N,
+ 'BLOCK_K': TritonConfig.BLOCK_K,
+ 'BLOCK_SIZE': TritonConfig.BLOCK_SIZE
+ }, num_stages=TritonConfig.NUM_STAGES, num_warps=TritonConfig.NUM_WARPS),
+ ],
+ key=['M', 'N', 'K'],
+)
+@triton.jit
+def _dds_kernel(A, B, C, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ stride_cm, stride_cn,
+ row_indices, column_indices, offsets,
+ block_offsets_t, trans_A: tl.constexpr, trans_B: tl.constexpr,
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr, GROUP_M: tl.constexpr, ACC_TYPE: tl.constexpr,
+ ):
+
+ # matrix multiplication
+ pid_m = tl.program_id(0)
+ pid_n = tl.program_id(1)
+
+ num_pid_m = tl.num_programs(0)
+ num_pid_n = tl.num_programs(1)
+ pid_n, pid_m = tl.swizzle2d(pid_n, pid_m, num_pid_n, num_pid_m, GROUP_M)
+
+ start_inx = tl.load(offsets + pid_n)
+ end_inx = tl.load(offsets + pid_n + 1)
+
+ # pointers to dense matrix
+ rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ rak = tl.arange(0, BLOCK_K)
+
+ A += (rm[:, None] * stride_am + rak[None, :] * stride_ak)
+
+ # pointers to sparse matrix
+ rn = tl.arange(0, BLOCK_N)
+ rbk = tl.arange(0, BLOCK_K)
+ B += (rbk[:, None] * stride_bk + rn[None, :] * stride_bn)
+
+ # do matrix multiplication
+ acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+ nsub_blocks = tl.cdiv(BLOCK_SIZE, BLOCK_K)
+
+ BLOCK_ELEMENTS = BLOCK_SIZE * BLOCK_SIZE
+
+ ak_sub_incr = BLOCK_K * stride_ak
+ ak_block_incr = BLOCK_SIZE * stride_ak
+ bk_sub_incr = BLOCK_K * stride_bk
+
+ for k in range(nsub_blocks * (end_inx - start_inx)):
+ sub_block_inx = k % nsub_blocks
+ block_inx = k // nsub_blocks
+
+ if trans_B:
+ ptr_B = B + (start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+ else:
+ ptr_B = B + tl.load(block_offsets_t + start_inx + block_inx) * BLOCK_ELEMENTS + sub_block_inx * bk_sub_incr
+
+ ptr_A = A + tl.load(column_indices + start_inx + block_inx) * ak_block_incr + sub_block_inx * ak_sub_incr
+ a = tl.load(ptr_A)
+ b = tl.load(ptr_B)
+ acc += tl.dot(a, b)
+
+ acc = acc.to(C.dtype.element_ty)
+ cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+ cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+ C = C + (cm[:, None] * stride_cm + cn[None, :] * stride_cn)
+ tl.store(C, acc, mask=True)
+
+def dsd(shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_a,
+ rhs,
+ out
+ ):
+
+ device = rhs.device
+ trans_A = transpose_a
+ trans_B = False
+
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if rhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = data.stride(1), data.stride(2)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+ a_column_indices = column_indices
+ a_offsets = offsets
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = data.stride(2), data.stride(1)
+ a_column_indices, a_offsets = column_indices_t, offsets_t
+
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _dsd_kernel[grid](
+ data.data, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, a_column_indices, a_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def dds(lhs,
+ shape,
+ data,
+ offsets,
+ row_indices,
+ column_indices,
+ offsets_t,
+ column_indices_t,
+ block_offsets_t,
+ transpose_b,
+ out
+ ):
+
+ device = lhs.device
+ trans_B = transpose_b
+ trans_A = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+
+ # checks constraints
+ assert lhs.shape[1] == shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if lhs.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = data.stride(1), data.stride(2)
+ b_column_indices = column_indices_t
+ b_offsets = offsets_t
+
+ # launch kernel
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_M']), triton.cdiv(N, META['BLOCK_N']))
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = data.stride(2), data.stride(1)
+ b_column_indices, b_offsets = column_indices, offsets
+
+ _dds_kernel[grid](
+ lhs, data, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(0), out.stride(1),
+ row_indices, b_column_indices, b_offsets,
+ block_offsets_t, trans_A, trans_B,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+def sdd(lhs,
+ rhs,
+ shape,
+ out,
+ offsets,
+ row_indices,
+ column_indices
+ ):
+
+ device = out.device
+ trans_A = False
+ trans_B = False
+
+ if lhs.stride(0) > 1 and lhs.stride(1) > 1:
+ trans_A = True
+ if rhs.stride(0) > 1 and rhs.stride(1) > 1:
+ trans_B = True
+
+ # checks constraints
+ assert lhs.shape[1] == rhs.shape[0], "incompatible dimensions"
+ M, K = lhs.shape
+ _, N = rhs.shape
+
+ _validate_matmul_dims(M, K, N)
+
+ # accumulator types
+ ACC_TYPE = tl.float32 if out.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+
+ # launch kernel
+ nnz_blocks = len(row_indices)
+ grid = lambda META: (nnz_blocks,)
+
+ stride_am, stride_ak = lhs.stride(0), lhs.stride(1)
+ stride_bk, stride_bn = rhs.stride(0), rhs.stride(1)
+
+ if trans_A:
+ stride_am, stride_ak = lhs.stride(1), lhs.stride(0)
+ if trans_B:
+ stride_bk, stride_bn = rhs.stride(1), rhs.stride(0)
+
+ _sdd_kernel[grid](
+ lhs, rhs, out, M, N, K,
+ stride_am, stride_ak,
+ stride_bk, stride_bn,
+ out.stride(1), out.stride(2),
+ row_indices, column_indices,
+ GROUP_M=128, ACC_TYPE=ACC_TYPE
+ )
+
+@triton.jit
+def _row_indices_kernel(offsets, out):
+ pid = tl.program_id(0)
+ row_offset = tl.load(offsets + pid)
+ nnz_blocks = tl.load(offsets + pid + 1) - row_offset
+ for nnz_block in range(nnz_blocks):
+ tl.store(out + row_offset + nnz_block, pid)
+
+def row_indices(
+ shape, data, offsets, column_indices, out
+):
+ block_rows = len(offsets) - 1
+ _row_indices_kernel[(block_rows, )](offsets, out)
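+
+# Note (illustrative): _row_indices_kernel writes, for every nonzero block, the
+# index of the block-row it belongs to; that is, it expands the CSR offsets
+# into one row id per block, with one Triton program per block row.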
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/matrix.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f42263d6aada287adbfa52a61fe950162a9e28
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/matrix.py
@@ -0,0 +1,329 @@
+import numpy as np
+import torch
+
+# 1. Add heavyweight (data) validation helper.
+# 2. Add construction helpers
+# 3. Make indentation consistent
+# 4. Replace asserts with descriptive errors.
+
+##
+### Validation helpers.
+##
+
+
+def _validate_matrix(shape, data, row_indices, column_indices, offsets):
+ # Data should be [nnz, block_size, block_size]
+ if data.dim() == 1:
+ data = torch.reshape(data, [data.numel(), 1, 1])
+
+ # Blocks should be square.
+ if data.shape[-2] != data.shape[-1]:
+ raise ValueError(
+ "Expected square blocking in data. "
+ f"Got block shape {[data.shape[-2], data.shape[-1]]}")
+
+ # Flatten batch dimensions on data - original shape preserved
+ # in shape argument.
+ block_size = data.shape[-1]
+ data = data.view([-1, block_size, block_size])
+
+    if data.dim() != 3:
+        raise ValueError(
+            "Expected 3D shape for data (nnz, block, block). "
+            f"Got {data.dim()}D data.")
+
+ block_size = data.shape[1]
+    if shape[-2] % block_size != 0 or shape[-1] % block_size != 0:
+        raise ValueError(
+            "Matrix shape must be divisible by blocking. "
+            f"Got shape {shape} with "
+            f"{[block_size, block_size]} blocking.")
+
+ if np.prod(shape) < data.numel():
+ raise ValueError(
+ "Invalid matrix. Number of nonzeros exceeds matrix capacity "
+ f"({data.numel()} v. {np.prod(shape)})")
+
+ if row_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D row_indices. Got {row_indices.dim()}D row_indices.")
+
+ if column_indices.dim() != 1:
+ raise ValueError(
+ f"Expected 1D column_indices. Got {column_indices.dim()}D column_indices.")
+
+ if offsets.dim() != 1:
+ raise ValueError(
+ f"Expected 1D offsets. Got {offsets.dim()}D offsets.")
+
+ if row_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {row_indices.numel()} row_indices for {data.shape[0]} blocks")
+
+ if column_indices.numel() != data.shape[0]:
+ raise ValueError(
+ "Expected 1 index per nonzero block. "
+ f"Got {column_indices.numel()} column_indices for {data.shape[0]} blocks")
+
+    block_rows = np.prod(shape[:-1]) // block_size
+ if offsets.numel() != block_rows + 1:
+ raise ValueError(
+ "Expected one offset per block row plus one. "
+ f"Got {offsets.numel()} offsets with {block_rows} block rows.")
+
+ is_cuda = (data.is_cuda and
+ row_indices.is_cuda and
+ column_indices.is_cuda and
+ offsets.is_cuda)
+ is_cpu = (not data.is_cuda and
+ not row_indices.is_cuda and
+ not column_indices.is_cuda and
+ not offsets.is_cuda)
+ if not (is_cuda or is_cpu):
+ raise ValueError(
+ "Expected data & meta-data on common device. "
+ f"Got data on {data.device}, row_indices on {row_indices.device} "
+ f"column_indices on {column_indices.device} and "
+ f"offsets on {offsets.device}.")
+
+ if data.dtype != torch.float16:
+ raise ValueError(
+ f"Expected float16 data. Got {data.dtype} data.")
+ if row_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 row_indices. Got {row_indices.dtype} row_indices.")
+ if column_indices.dtype != torch.int16:
+ raise ValueError(
+ f"Expected int16 column_indices. Got {column_indices.dtype} column_indices.")
+ if offsets.dtype != torch.int32:
+ raise ValueError(
+ f"Expected int32 offsets. Got {offsets.dtype} offsets.")
+ return data
+
+
+def _transpose(size, data, row_indices, column_indices, offsets):
+ block_columns = size[1] // data.shape[1]
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ gather_indices = column_indices.argsort()
+ column_indices_t = row_indices.gather(0, gather_indices)
+ block_offsets_t = gather_indices.int()
+
+ # NOTE: Histogram is not implemented for any integer type on CPU. Do
+ # the histogram in 32-bit float, which can exactly represent 16-bit
+ # integers.
+ column_indices_float = column_indices.float()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=data.device)
+ nnz_per_column = column_indices_float.histc(block_columns, 0, block_columns)
+ nnz_per_column = nnz_per_column.int()
+ offsets_t = torch.cat([zero, nnz_per_column.cumsum(0, dtype=torch.int32)])
+ return column_indices_t, offsets_t, block_offsets_t
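+
+# Worked example (illustrative): for nonzero blocks (0, 1), (1, 0) and (1, 1),
+# i.e. row_indices = [0, 1, 1] and column_indices = [1, 0, 1], a stable argsort
+# of the column indices gives gather_indices = [1, 0, 2]. The transpose then
+# has column_indices_t = [1, 0, 1], block_offsets_t = [1, 0, 2], and with one
+# block in column 0 and two in column 1, offsets_t = [0, 1, 3].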
+
+
+class Matrix(torch.nn.Module):
+ """A matrix stored in sparse format.
+
+ Underlying format is block compressed sparse row (BCSR).
+
+ TODO(tgale): Make this mirror torch.Tensor API as much as possible.
+ """
+
+ def __init__(self,
+ size,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t=None,
+ offsets_t=None,
+ block_offsets_t=None):
+ super().__init__()
+ self._size = size
+ self._data = data
+ self._row_indices = row_indices
+ self._column_indices = column_indices
+ self._offsets = offsets
+
+ # Produce the transpose meta-data if it is not passed in.
+ if ((column_indices_t is None) or (offsets_t is None) or
+ (block_offsets_t is None)):
+ column_indices_t, offsets_t, block_offsets_t = _transpose(
+ size, data, row_indices, column_indices, offsets)
+ self._column_indices_t = column_indices_t
+ self._offsets_t = offsets_t
+ self._block_offsets_t = block_offsets_t
+
+ self._transposed = False
+
+ # Validate that our metadata will not overflow.
+ max_dim = np.iinfo(np.int16).max * self.blocking
+ if column_indices.dtype == torch.int16:
+ if size[0] > max_dim or size[1] > max_dim:
+                raise ValueError(
+                    f"Sparse matrix with shape {size} exceeds representable "
+                    "size with 16-bit indices.")
+
+ def validate(self):
+ _validate_matrix(self._size,
+ self._data,
+ self._row_indices,
+ self._column_indices,
+ self._offsets)
+
+ # TODO(tgale): Add heavyweight data validation.
+
+ def to(self, device):
+ # TODO(tgale): Handle type conversions here. We
+ # need to set the appropriate meta-data type for
+ # the given floating-point type.
+ self._data = self._data.to(device)
+ self._row_indices = self._row_indices.to(device)
+ self._column_indices = self._column_indices.to(device)
+ self._offsets = self._offsets.to(device)
+ self._column_indices_t = self._column_indices_t.to(device)
+ self._offsets_t = self._offsets_t.to(device)
+ self._block_offsets_t = self._block_offsets_t.to(device)
+ return self
+
+ def cuda(self):
+ return self.to(torch.cuda.current_device())
+
+ def clone(self):
+ return Matrix(
+ self.size(),
+ self.data.clone(),
+ self.row_indices.clone(),
+ self.column_indices.clone(),
+ self.offsets.clone(),
+ self.column_indices_t.clone(),
+ self.offsets_t.clone(),
+ self.block_offsets_t.clone())
+
+ def t(self):
+ if self.dim() != 2:
+ raise ValueError(
+ "t() expects a tensor with <= 2 dimensions, "
+ f"but self is {self.dim()}D.")
+ out = Matrix(self.size(),
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ out._transposed = not self._transposed
+ out._size = torch.Size((self._size[1], self._size[0]))
+ return out
+
+ def contiguous(self):
+ raise ValueError("Not yet implemented.")
+
+ def is_contiguous(self):
+ return not self._transposed
+
+ @property
+ def is_cuda(self):
+ return self._data.is_cuda
+
+ @property
+ def device(self):
+ return self._data.device
+
+ def size(self):
+ return self._size
+
+ @property
+ def shape(self):
+ return self.size()
+
+ def dim(self):
+ return len(self._size)
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def row_indices(self):
+ return self._row_indices
+
+ @property
+ def column_indices(self):
+ return self._column_indices
+
+ @property
+ def offsets(self):
+ return self._offsets
+
+ @property
+ def offsets_t(self):
+ return self._offsets_t
+
+ @property
+ def column_indices_t(self):
+ return self._column_indices_t
+
+ @property
+ def block_offsets_t(self):
+ return self._block_offsets_t
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ @property
+ def nnz(self):
+ return self.data.numel()
+
+ @property
+ def blocking(self):
+ return self.data.shape[1]
+
+ @property
+ def requires_grad(self):
+ return self.data.requires_grad
+
+ def requires_grad_(self, x):
+ self.data.requires_grad_(x)
+ return self
+
+ def view(self, *shape):
+ assert self.is_contiguous()
+ if shape[-1] != self.size()[-1]:
+ raise ValueError(
+ "Can't change view on compressed dimension. "
+ f"{self.size()[-1]} v. {shape[-1]}.")
+ if np.prod(shape) != np.prod(self.size()):
+ raise ValueError(
+ "Mismatch in numel of Matrix and new shape. "
+ f"{np.prod(self.size())} v. {np.prod(shape)}")
+ return Matrix(shape,
+ self.data,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+
+ @property
+ def grad(self):
+ # TODO(tgale): Make sure this mirrors torch.Tensor
+ # behavior in the case where we ask for the gradient
+ # of a non-contiguous tensor.
+ size = self.size()
+ if not self.is_contiguous():
+ size = torch.Size((size[1], size[0]))
+ out = Matrix(size,
+ self.data.grad,
+ self.row_indices,
+ self.column_indices,
+ self.offsets,
+ self.column_indices_t,
+ self.offsets_t,
+ self.block_offsets_t)
+ return out if self.is_contiguous() else out.t()
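+
+
+# Illustrative construction sketch (assumptions: CUDA device, float16 data):
+# a 256x256 matrix with 128x128 blocks and nonzero blocks at (0, 0) and (1, 1)
+# could be assembled as
+#
+#   data = torch.randn(2, 128, 128, dtype=torch.float16, device="cuda")
+#   row_indices = torch.tensor([0, 1], dtype=torch.int16, device="cuda")
+#   column_indices = torch.tensor([0, 1], dtype=torch.int16, device="cuda")
+#   offsets = torch.tensor([0, 1, 2], dtype=torch.int32, device="cuda")
+#   x = Matrix((256, 256), data, row_indices, column_indices, offsets)
+#
+# The transpose meta-data (column_indices_t, offsets_t, block_offsets_t) is
+# derived by _transpose() whenever it is not passed in explicitly.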
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc873b236f4cd4036964c016a4036e3ce5ebf1ac
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/__init__.py
@@ -0,0 +1,3 @@
+from .linear_ops import dds, dsd, sdd
+from .matrix_ops import ones_like, row_indices, sum, to_dense, to_sparse
+from .eltwise_ops import mul
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7d7332320250fd01fa60e60528f19de3e8ed03
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops.py
@@ -0,0 +1,28 @@
+from ..matrix import Matrix
+
+def mul(a, b):
+ """Performs element-wise multiplication of matrices a and b.
+
+ It is the user's responsibility to make sure that a and b
+ follow the same matrix topology. This function assumes it is safe
+    to use the topology of a.
+
+ Args:
+ a: stk.Matrix.
+ b: stk.Matrix with a's matrix topology.
+
+ Returns:
+ stk.Matrix where the entries correspond to torch.mul(a, b).
+ """
+ assert isinstance(a, Matrix)
+ assert isinstance(b, Matrix)
+ assert a.size() == b.size()
+
+ return Matrix(a.size(),
+ a.data * b.data,
+ a.row_indices,
+ a.column_indices,
+ a.offsets,
+ a.column_indices_t,
+ a.offsets_t,
+ a.block_offsets_t)
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops_test.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bfd4f6af77042d3c5bdb1fe18d00e457478d46
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops_test.py
@@ -0,0 +1,86 @@
+import unittest
+import itertools
+import torch
+from absl.testing import parameterized
+
+import stk
+from stk.ops.linear_ops_test import allclose, _dense_and_sparse
+
+_MATRIX_SIZES = (
+ (128, 128, 0.0),
+ (256, 256, 0.5),
+ (2048, 1024, 0.8),
+ (512, 128, 0.0),
+ (128, 512, 0.0),
+ (1024, 512, 0.0),
+ (1024, 512, 0.5),
+ (1024, 512, 0.75),
+ (512, 1024, 0.0),
+ (512, 1024, 0.5),
+ (512, 1024, 0.75),
+ (1024, 1024, 0.0),
+ (1024, 1024, 0.5),
+ (1024, 1024, 0.75),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _DTYPE)
+ testcases = [(*size, 128, dtype) for
+ (size, dtype) in testcases]
+ return testcases
+
+_ELTWISE_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse_like(x, std=0.1):
+ dense_data = torch.randn_like(x.data, device=x.device) * std
+ sparse = stk.Matrix(x.size(),
+ dense_data,
+ x.row_indices,
+ x.column_indices,
+ x.offsets)
+ dense = stk.ops.to_dense(sparse)
+
+ return (dense.requires_grad_(True),
+ sparse.requires_grad_(True))
+
+@parameterized.parameters(_ELTWISE_OP_TESTS)
+class EltwiseOpsTest(parameterized.TestCase):
+
+ def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+
+ a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+ b_dense, b = _dense_and_sparse_like(a)
+
+ out = stk.ops.mul(a, b)
+ expected_out = torch.mul(a_dense, b_dense)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size(), out.size())
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = a_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = b_dense.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size(), grad.size())
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/linear_ops.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/linear_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d277c8c07f9e30addc31900a12175c8a1f4d7ad
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/linear_ops.py
@@ -0,0 +1,59 @@
+import torch
+
+from ..backend import sputnik
+from ..matrix import Matrix
+
+
+def dsd(a, b):
+ assert isinstance(a, Matrix)
+ assert isinstance(b, torch.Tensor)
+ return sputnik.dsd(
+ a.size(),
+ a.data, a.offsets,
+ a.row_indices,
+ a.column_indices,
+ a.offsets_t,
+ a.column_indices_t,
+ a.block_offsets_t,
+ not a.is_contiguous(),
+ b)
+
+
+def dds(a, b):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, Matrix)
+ return sputnik.dds(
+ a,
+ b.size(),
+ b.data, b.offsets,
+ b.row_indices,
+ b.column_indices,
+ b.offsets_t,
+ b.column_indices_t,
+ b.block_offsets_t,
+ not b.is_contiguous())
+
+
+def sdd(a, b, topo):
+ assert isinstance(a, torch.Tensor)
+ assert isinstance(b, torch.Tensor)
+ assert isinstance(topo, Matrix)
+ assert topo.is_contiguous()
+ out = sputnik.sdd(
+ a, b,
+ topo.size(),
+ topo.data,
+ topo.offsets,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets_t,
+ topo.column_indices_t,
+ topo.block_offsets_t)
+ return Matrix(topo.size(),
+ out,
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t)
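+
+
+# Naming convention: the three letters give the (output, lhs, rhs) formats,
+# with "d" dense and "s" sparse. A hypothetical usage sketch (assuming the
+# package is importable as `stk` and CUDA float16 operands):
+#
+#   a = torch.randn(512, 512, dtype=torch.float16, device="cuda")
+#   b = torch.randn(512, 512, dtype=torch.float16, device="cuda")
+#   topo = stk.random.mask(512, 512, sparsity=0.75, blocking=128).cuda()
+#   c = stk.ops.sdd(a, b, topo)   # sparse output = dense @ dense, masked by topo
+#   d = stk.ops.dsd(c, b)         # dense output  = sparse @ dense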
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/linear_ops_test.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/linear_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ced1d782fbc9f9ca16b3449239f1588dc5ff5e00
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/linear_ops_test.py
@@ -0,0 +1,216 @@
+import unittest
+import itertools
+import numpy as np
+import torch
+from absl.testing import parameterized
+
+import stk
+
+
+def allclose(x, y, pct=0.25):
+ mask = torch.isclose(x, y, rtol=5e-2)
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+ if pct_diff > pct:
+ print("{:.2f}% of values not close.".format(pct_diff))
+ return False
+ return True
+
+
+# An assortment of problems designed to make sure
+# the bindings are operating correctly.
+_MATRIX_SIZES = (
+ (128, 128, 128, 0.0),
+ (256, 256, 256, 0.5),
+ (2048, 1024, 512, 0.8),
+ (512, 128, 128, 0.0),
+ (128, 128, 512, 0.0),
+ (1024, 512, 512, 0.0),
+ (1024, 512, 512, 0.5),
+ (1024, 512, 512, 0.75),
+ (512, 512, 1024, 0.0),
+ (512, 512, 1024, 0.5),
+ (512, 512, 1024, 0.75),
+ (1024, 1024, 1024, 0.0),
+ (1024, 1024, 1024, 0.5),
+ (1024, 1024, 1024, 0.75),
+)
+
+_TRANSPOSE = (
+ (False, False),
+ (False, True),
+ (True, False),
+ (True, True),
+)
+
+_DTYPE = (
+ torch.float16, torch.bfloat16
+)
+
+def _generate_testcases():
+ testcases = itertools.product(_MATRIX_SIZES, _TRANSPOSE, _DTYPE)
+ testcases = [(*size, *trans, 128, dtype) for
+ (size, trans, dtype) in testcases]
+ return testcases
+
+_LINEAR_OP_TESTS = _generate_testcases()
+
+def _dense_and_sparse(rows, cols, sparsity, blocking, dtype, std=0.1):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ dense = (torch.randn(rows, cols) * std * mask).type(dtype)
+ sparse = stk.ops.to_sparse(dense, blocking)
+ cuda_device = torch.device("cuda")
+ return (dense.to(cuda_device).requires_grad_(True),
+ sparse.to(cuda_device).requires_grad_(True))
+
+
+def _dense(rows, cols, dtype, std=0.1):
+ cuda_device = torch.device("cuda")
+ out = (torch.randn(rows, cols) * std).type(dtype)
+ return out.to(cuda_device).requires_grad_(True)
+
+
+def _dense_2x(rows, cols, dtype):
+ a = _dense(rows, cols, dtype)
+ return a, a.detach().requires_grad_(True)
+
+
+def _with_transpose(op, a, b, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b)
+
+
+def _mmm(a, b, topo):
+ mask = stk.ops.to_dense(stk.ops.ones_like(topo))
+ return torch.mm(a, b) * mask
+
+
+def _sparse_out_with_transpose(op, a, b, topo, trans_a, trans_b):
+ a = a.t() if trans_a else a
+ b = b.t() if trans_b else b
+ return op(a, b, topo)
+
+
+def _mask(x, mask):
+ mask = stk.ops.to_dense(stk.ops.ones_like(mask))
+ return x * mask
+
+
+@parameterized.parameters(*_LINEAR_OP_TESTS)
+class LinearOpsTest(parameterized.TestCase):
+
+ def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = stk.ops.to_dense(a.grad)
+ expected_grad = _mask(a_dense.grad, a.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+ expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ out.sum().backward()
+
+ # Validate the results.
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = stk.ops.to_dense(b.grad)
+ expected_grad = _mask(b_dense.grad, b.grad)
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+ # Construct the operands.
+ a_shape = (k, m) if trans_a else (m, k)
+ a, acp = _dense_2x(*a_shape, dtype)
+ b_shape = (n, k) if trans_b else (k, n)
+ b, bcp = _dense_2x(*b_shape, dtype)
+ _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+
+ # Execute the matmul.
+ out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+ expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+
+ # Compute the gradients w.r.t. the inputs.
+ expected_out.sum().backward()
+ stk.ops.sum(out).backward()
+
+ # Validate the results.
+ out = stk.ops.to_dense(out)
+ self.assertEqual(out.dim(), 2)
+ self.assertEqual(expected_out.size()[0], out.size()[0])
+ self.assertEqual(expected_out.size()[1], out.size()[1])
+ self.assertTrue(allclose(out, expected_out))
+
+ # LHS gradient.
+ grad = a.grad
+ expected_grad = acp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+ # RHS gradient.
+ grad = b.grad
+ expected_grad = bcp.grad
+ self.assertEqual(grad.dim(), 2)
+ self.assertEqual(expected_grad.size()[0], grad.size()[0])
+ self.assertEqual(expected_grad.size()[1], grad.size()[1])
+ self.assertTrue(allclose(grad, expected_grad))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..447c72dc73439d84f58c917676cc04e64f13e97d
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops.py
@@ -0,0 +1,98 @@
+from ..backend import sputnik
+from ..matrix import Matrix
+import torch
+import numpy as np
+
+
+@torch.no_grad()
+def row_indices(shape, data, offsets, column_indices):
+ return sputnik.row_indices(shape, data, offsets, column_indices)
+
+
+# TODO(tgale): Replace this helper with a custom kernel. This operation
+# is much simpler to do than how it's currently implemented.
+@torch.no_grad()
+def _expand_for_blocking(idxs, blocking):
+ # Duplicate for block column dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, 2]).repeat(1, blocking, 1)
+
+ # Update the column indices.
+ idxs[:, :, 1] *= blocking
+ idxs[:, :, 1] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking])
+
+ # Duplicate for block row dimension.
+ idxs = torch.reshape(idxs, [idxs.size()[0], 1, blocking, 2])
+ idxs = idxs.repeat(1, blocking, 1, 1)
+
+ # Update the row indices.
+ idxs[:, :, :, 0] *= blocking
+ idxs[:, :, :, 0] += torch.reshape(torch.arange(blocking, device=idxs.device), [1, blocking, 1])
+ idxs = torch.reshape(idxs, [-1, 2])
+ return idxs
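+
+# Worked example (illustrative): with blocking=2, the single block coordinate
+# [[1, 0]] expands to the four element coordinates covered by that block,
+# [[2, 0], [2, 1], [3, 0], [3, 1]], i.e. rows 2..3 and columns 0..1 of the
+# dense matrix.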
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_dense(x):
+ assert isinstance(x, Matrix)
+
+ shape = (np.prod(x.shape[:-1]), x.shape[-1])
+ row_idxs = x.row_indices.type(torch.int32)
+ col_idxs = x.column_indices.type(torch.int32)
+ indices = _expand_for_blocking(torch.stack([row_idxs, col_idxs], dim=1), x.blocking)
+ indices = (indices[:, 0] * shape[1] + indices[:, 1]).type(torch.int64)
+
+ out = torch.zeros(shape[0] * shape[1], dtype=x.dtype, device=x.device)
+ out.scatter_(0, indices, x.data.flatten())
+ return out.reshape(x.size())
+
+
+@torch.no_grad()
+def _mask(x, blocking=1):
+ assert x.dim() == 2
+ assert x.size()[0] % blocking == 0
+ assert x.size()[1] % blocking == 0
+ block_rows = x.size()[0] // blocking
+ block_cols = x.size()[1] // blocking
+ x = torch.reshape(x, [block_rows, blocking, block_cols, blocking])
+ x = torch.sum(torch.abs(x), dim=(1, 3))
+ return x != 0
+
+
+# TODO(tgale): Add input type checking.
+@torch.no_grad()
+def to_sparse(x, blocking=1):
+ m = _mask(x, blocking)
+
+ # TODO(tgale): Set to appropriate type for input matrix.
+ row_nnzs = torch.sum(m, dim=1).type(torch.int32)
+ zeros = torch.zeros((1,), dtype=row_nnzs.dtype, device=row_nnzs.device)
+ offsets = torch.cat([zeros, torch.cumsum(row_nnzs, dim=0)])
+ offsets = offsets.type(torch.int32)
+
+ indices = torch.nonzero(m).type(torch.int16)
+ row_indices = indices[:, 0]
+ column_indices = indices[:, 1]
+
+ # Nonzero indices in the dense matrix.
+ nonzero_indices = torch.nonzero(m)
+ nonzero_indices = _expand_for_blocking(nonzero_indices, blocking)
+ nonzero_indices = nonzero_indices[:, 0] * x.size()[1] + nonzero_indices[:, 1]
+
+ # Gather the data and construct the sparse matrix.
+ data = torch.gather(x.flatten(), dim=0, index=nonzero_indices)
+ data = torch.reshape(data, [-1, blocking, blocking])
+ return Matrix(x.size(), data, row_indices, column_indices, offsets)
+
+
+@torch.no_grad()
+def ones_like(x):
+ return Matrix(x.size(),
+ torch.ones_like(x.data),
+ x.row_indices,
+ x.column_indices, x.offsets)
+
+
+def sum(x):
+ assert isinstance(x, Matrix)
+ return x.data.sum()
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops_test.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3af04c0760483e578f93303dc457415948a2a34c
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops_test.py
@@ -0,0 +1,62 @@
+import unittest
+
+from absl.testing import parameterized
+import stk
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class MatrixOpsTest(parameterized.TestCase):
+
+ def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+ mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+ x = (torch.randn(rows, cols) * mask).type(torch.float16)
+
+ # Convert the matrix to sparse format.
+ sparse_x = stk.ops.to_sparse(x, blocking)
+
+ # Validate the matrix.
+ sparse_x.validate()
+
+ # Validate the shape.
+ self.assertEqual(sparse_x.dim(), 2)
+ self.assertEqual(sparse_x.size()[0], rows)
+ self.assertEqual(sparse_x.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(sparse_x.nnz, nnz)
+
+ # Convert back to dense format.
+ dense_x = stk.ops.to_dense(sparse_x)
+
+ # Validate the shape.
+ self.assertEqual(dense_x.dim(), 2)
+ self.assertEqual(dense_x.size()[0], rows)
+ self.assertEqual(dense_x.size()[1], cols)
+
+ # Validate the sparsity
+ self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+
+ # Validate the output.
+ self.assertTrue(torch.all(torch.eq(x, dense_x)))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/random/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2576d1ca27283f77569a9a620c7c99fa68aaf30e
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/random/__init__.py
@@ -0,0 +1,2 @@
+# from stk.random.random_ops import dense_mask, mask, randn
+from .random_ops import dense_mask, mask, randn
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/random/random_ops.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/random/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36771e0eb8e7abf46bcb3b136b5fb1d29df93
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/random/random_ops.py
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+from ..ops import matrix_ops
+
+
+@torch.no_grad()
+def dense_mask(rows, cols, sparsity, blocking=1):
+ assert sparsity >= 0.0 and sparsity <= 1.0
+ assert rows % blocking == 0 and cols % blocking == 0
+
+ block_rows, block_cols = (rows // blocking, cols // blocking)
+ nnz = round(block_rows * block_cols * (1 - sparsity))
+
+ out = np.ones(block_rows * block_cols)
+ mask = np.random.choice(out.size, out.size - nnz, replace=False)
+ out[mask] = 0.0
+
+ out = np.tile(
+ np.reshape(out, [block_rows, 1, block_cols, 1]),
+ (1, blocking, 1, blocking))
+ out = np.reshape(out, [rows, cols])
+ return torch.from_numpy(out.astype(np.float32))
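+
+# Example (illustrative): dense_mask(256, 256, sparsity=0.75, blocking=128)
+# yields a 2x2 block grid and keeps round(4 * 0.25) = 1 block, i.e. exactly
+# 128 * 128 nonzero entries in the returned float32 mask.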
+
+
+@torch.no_grad()
+def mask(m, n, sparsity, blocking=1):
+ out = dense_mask(m, n, sparsity, blocking).type(torch.float16)
+ return matrix_ops.to_sparse(out, blocking=blocking)
+
+
+@torch.no_grad()
+def randn(shape, sparsity, blocking=1):
+ shape_2d = (np.prod(shape[:-1]), shape[-1])
+ out = mask(*shape_2d, sparsity, blocking)
+ out.data.copy_(torch.randn(*out.data.shape))
+ return out.view(*shape)
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/stk/random/random_ops_test.py b/build/torch29-cxx11-cu130-x86_64-linux/stk/random/random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..587b44ec890c861879c6296b8f9028f5d99ab82f
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/stk/random/random_ops_test.py
@@ -0,0 +1,73 @@
+import unittest
+
+from absl.testing import parameterized
+from . import random_ops as random
+import torch
+
+
+@parameterized.parameters(
+ (8, 16, 0.0, 1),
+ (8, 16, 0.5, 1),
+ (8, 16, .95, 1),
+ (16, 8, 0.0, 1),
+ (16, 8, 0.5, 1),
+ (16, 8, .95, 1),
+ (8, 16, 0.0, 8),
+ (8, 16, 0.5, 8),
+ (8, 16, 1.0, 8),
+ (16, 8, 0.0, 8),
+ (16, 8, 0.5, 8),
+ (16, 8, 1.0, 8),
+ (128, 256, 0.5, 16),
+ (256, 128, 0.75, 32),
+ (512, 512, .875, 128))
+class RandomOpsTest(parameterized.TestCase):
+
+ def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+ mask = random.dense_mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(
+ torch.count_nonzero(mask).item(),
+ nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask, 0),
+ torch.eq(mask, 1))))
+
+ def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+ mask = random.mask(
+ rows, cols, sparsity, blocking)
+
+ # Validate the matrix.
+ mask.validate()
+
+ # Validate the shape.
+ self.assertEqual(mask.dim(), 2)
+ self.assertEqual(mask.size()[0], rows)
+ self.assertEqual(mask.size()[1], cols)
+
+ # Validate the sparsity.
+ numblocks = rows // blocking * cols // blocking
+ nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+ self.assertEqual(mask.nnz, nnz)
+
+ # Check values are zero or one.
+ self.assertTrue(
+ torch.all(torch.logical_or(
+ torch.eq(mask.data, 0),
+ torch.eq(mask.data, 1))))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/xpu_fused_moe.py b/build/torch29-cxx11-cu130-x86_64-linux/xpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440b484e6f4073a384dcf3bc562174d2ef71d25
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/xpu_fused_moe.py
@@ -0,0 +1,672 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks XPU Fused MoE Implementation
+import os
+import torch
+
+from ._ops import ops, add_op_namespace_prefix
+
+from torch.library import register_fake
+
+
+def resolve_dtensor(weight: torch.Tensor):
+ """Convert DTensor to local tensor for use with custom ops."""
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+# Register fake/meta kernels for torch.compile compatibility
+def _register_xpu_fake_kernels():
+ """Register fake kernels for XPU MoE operations to support torch.compile."""
+
+ def _register_if_available(op_name, fn):
+ if hasattr(ops, op_name):
+ register_fake(add_op_namespace_prefix(op_name))(fn)
+
+ _register_if_available(
+ "cutlass_grouped_gemm_interface",
+ lambda ptr_A, ptr_B, ptr_scales, ptr_bias, ptr_D, expert_first_token_offset, N, K, num_experts, is_B_int4, is_B_mxfp4: ptr_D,
+ )
+
+ _register_if_available(
+ "fused_moe_prologue",
+ lambda input, token_selected_experts, token_final_scales, workspace, hidden_size, inter_size, ep_rank, ep_size, num_experts_on_rank: None,
+ )
+
+ _register_if_available(
+ "moe_gather",
+ lambda output, moe_output, topk_weights, permuted_row_to_unpermuted_row, unpermuted_row_to_permuted_row, expert_first_token_offset, num_experts: None,
+ )
+
+ _register_if_available(
+ "silu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "mul_and_silu",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_tanh_and_mul",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_fast",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_new",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "gelu_quick",
+ lambda out, input: None,
+ )
+ _register_if_available(
+ "swigluoai_and_mul",
+ lambda out, input, alpha=1.702, limit=7.0: None,
+ )
+
+
+# Register fake kernels on module load
+_register_xpu_fake_kernels()
+
+
+# Default grouped GEMM wrapper (see cutlass_grouped_gemm_xe2 below for the Xe2 variant).
+def cutlass_grouped_gemm(input_A, input_B, bias, output, expert_token_count, n,
+ k, num_experts):
+ # expert_token_count_ = torch.tensor(expert_token_count,
+ # dtype=torch.int64,
+ # device=input_A.device)
+ # if bias is not None:
+ # bias = bias.repeat_interleave(expert_token_count_, dim=0).float()
+
+ def exclusive_prefix_sum(arr):
+ prefix = [0]
+ for i, x in enumerate(arr):
+ prefix.append(prefix[-1] + x)
+ return prefix
+
+ expert_offset = torch.tensor(exclusive_prefix_sum(expert_token_count),
+ dtype=torch.int64,
+ device="xpu")
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=False,
+ is_B_mxfp4=False)
+
+
+def cutlass_grouped_gemm_xe2(input_A, input_B, scales, bias, output,
+ num_rows_per_expert, n, k, num_experts, is_B_int4,
+ is_B_mxfp4):
+ expert_first_token_offset = torch.cat([
+ torch.tensor([0],
+ dtype=num_rows_per_expert.dtype,
+ device=num_rows_per_expert.device),
+ torch.cumsum(num_rows_per_expert, dim=0)
+ ]).to(torch.int64)
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=scales,
+ ptr_bias=bias,
+ ptr_D=output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=n,
+ K=k,
+ num_experts=num_experts,
+ is_B_int4=is_B_int4,
+ is_B_mxfp4=is_B_mxfp4)
+
+
+def ceilDiv(a, b):
+ return (a + b - 1) // b
+
+
+def compute_num_tokens_per_block(num_tokens, num_experts_per_node):
+ for num_tokens_per_block in [32, 64, 128, 256, 512, 1024]:
+ num_blocks_per_seq = ceilDiv(num_tokens, num_tokens_per_block)
+ if num_blocks_per_seq * num_experts_per_node <= num_tokens_per_block:
+ return num_tokens_per_block
+ return 1024
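+
+
+# Example (illustrative): with num_tokens=1000 and num_experts_per_node=8 the
+# search above returns 128, since ceilDiv(1000, 128) * 8 = 64 <= 128, while the
+# smaller candidates fail (e.g. ceilDiv(1000, 64) * 8 = 128 > 64).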
+
+
+def _bytes_to_typed_tensor(byte_tensor: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+ """Reinterpret a uint8 buffer as a typed tensor by copying bytes.
+
+ This avoids `Tensor.view(dtype)` which can fail under torch.compile
+ constant folding when shape divisibility is not proven.
+ """
+ if byte_tensor.dtype != torch.uint8:
+ raise ValueError("byte_tensor must be uint8")
+ itemsize = torch.empty((), dtype=dtype).element_size()
+ numel = byte_tensor.numel() // itemsize
+ out = torch.empty((numel,), dtype=dtype, device=byte_tensor.device)
+ out.view(torch.uint8).copy_(byte_tensor.contiguous())
+ return out
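+
+
+# Example (illustrative): eight consecutive uint8 bytes reinterpret as one
+# int64 value, so a workspace slice of N bytes yields N // 8 int64 entries:
+#
+#   raw = torch.zeros(16, dtype=torch.uint8)
+#   vals = _bytes_to_typed_tensor(raw, torch.int64)  # shape (2,), all zeros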
+
+
+def implement_zp(qweight):
+ # change u4 to s4 to avoid zero point in gemm kernel
+ # only support default zero point now
+ assert qweight.dtype == torch.uint8, "Input tensor must be uint8"
+
+ high_u4 = (qweight >> 4) & 0x0F
+ low_u4 = qweight & 0x0F
+
+ high_s8 = high_u4.to(torch.int8)
+ low_s8 = low_u4.to(torch.int8)
+
+ high_s8 = high_s8 - 8
+ low_s8 = low_s8 - 8
+
+ def pack_compact(a, b):
+
+ def process_number(x):
+ sign = (x < 0).to(torch.uint8)
+ abs_low3 = (x.view(torch.uint8) & 0x7).to(torch.uint8)
+ return (sign << 3) | abs_low3
+
+ packed_a = process_number(a)
+ packed_b = process_number(b)
+
+ return (packed_a << 4) | packed_b
+
+ result = pack_compact(high_s8, low_s8)
+
+ return result
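+
+
+# Worked example (illustrative): the packing above reproduces 4-bit two's
+# complement with the default zero point of 8 folded in. A stored uint4
+# nibble of 0x0 maps to -8 (sign=1, low bits 000 -> 0b1000), 0x7 maps to -1
+# (0b1111), and 0xF maps to +7 (0b0111), so the GEMM kernel can read the
+# repacked weights directly as signed int4.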
+
+
+def xpu_fused_moe(hidden_states,
+ w13,
+ w13_scales,
+ w13_bias,
+ w2,
+ w2_scales,
+ w2_bias,
+ topk_weights,
+ topk_ids,
+ n_experts_per_token,
+ activation,
+ num_experts,
+ ep_rank=0,
+ ep_size=1,
+ is_fp8=False,
+ is_int4=False,
+ is_mxfp4=False):
+ '''
+ hidden_states: [num_rows, hidden_size]
+ w13: [num_experts, 2*inter_size, hidden_size]
+ w13_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, 2*inter_size, hidden_size // group_size] for 4bits
+ w13_bias: [num_experts, 2*inter_size] or None
+ w2: [num_experts, hidden_size, inter_size]
+ w2_scales:
+ None for bf16/fp16
+ or [num_experts] for fp8
+ or [num_experts, hidden_size, inter_size // group_size] for 4bits
+ w2_bias: [num_experts, hidden_size] or None
+ topk_weights: [num_rows, topk]
+ topk_ids: [num_rows, topk]
+ n_experts_per_token: int
+ activation: str
+ num_experts: int
+ is_int4: bool
+ is_mxfp4: bool
+ '''
+
+ # Resolve DTensors to local tensors before passing to custom ops
+ hidden_states = resolve_dtensor(hidden_states)
+ w13 = resolve_dtensor(w13)
+ w2 = resolve_dtensor(w2)
+ if w13_scales is not None:
+ w13_scales = resolve_dtensor(w13_scales)
+ if w13_bias is not None:
+ w13_bias = resolve_dtensor(w13_bias)
+ if w2_scales is not None:
+ w2_scales = resolve_dtensor(w2_scales)
+ if w2_bias is not None:
+ w2_bias = resolve_dtensor(w2_bias)
+ topk_weights = resolve_dtensor(topk_weights)
+ topk_ids = resolve_dtensor(topk_ids)
+
+ output = torch.empty_like(hidden_states)
+ num_rows, hidden_size = list(hidden_states.shape)
+
+ dim_last = w13.shape[-1]
+ dim_second_last = w13.shape[-2]
+
+ # w13 is combined gate+up weights, so one dimension is 2*inter_size
+ # Determine which dimension is hidden_size and which is 2*inter_size
+ if dim_second_last == hidden_size:
+ # w13 is [E, hidden_size, 2*inter_size] - standard layout
+ inter_size = dim_last // 2
+ needs_transpose = False
+ else:
+ # w13 is [E, 2*inter_size, hidden_size] - needs transpose
+ inter_size = dim_second_last // 2
+ needs_transpose = True
+
+ assert w13.is_contiguous() and w2.is_contiguous()
+
+ # 4bits support [E, N, K]
+ # other types [E, K, N]
+ if not is_int4 and not is_mxfp4:
+ if not hasattr(w13, 'xpu_fused_moe'):
+ if needs_transpose:
+ w13.data = w13.transpose(-1, -2).contiguous()
+ w2.data = w2.transpose(-1, -2).contiguous()
+ w13.xpu_fused_moe = True
+ w13.inter_size = inter_size
+ else:
+ inter_size = w13.inter_size
+
+ if is_int4 and not hasattr(w13, 'xpu_fused_moe'):
+ w13_tmp = torch.empty_like(w13)
+ w2_tmp = torch.empty_like(w2)
+ for i in range(num_experts):
+ w13_tmp[i] = implement_zp(w13[i])
+ w2_tmp[i] = implement_zp(w2[i])
+ w13_tmp = w13_tmp.contiguous()
+ w2_tmp = w2_tmp.contiguous()
+ w13.data = w13_tmp
+ w2.data = w2_tmp
+ w13.xpu_fused_moe = True
+
+    # TODO: this will all be integrated into the C++ func; temporarily exposed before GEMM fusion.
+ num_experts_per_node = num_experts
+ experts_per_token = n_experts_per_token
+ num_moe_inputs = n_experts_per_token * num_rows
+ permuted_elems = num_moe_inputs * hidden_size
+ # interbuf_elems = num_moe_inputs * inter_size
+ permuted_row_to_unpermuted_row_size = num_moe_inputs * 4
+ permuted_token_selected_experts_size = num_moe_inputs * 4
+ src_to_dest_map_size = experts_per_token * num_rows * 4
+ expert_first_token_offset_size = (num_experts_per_node + 1) * 8
+ num_tokens_per_block = compute_num_tokens_per_block(
+ num_rows, num_experts_per_node)
+ num_blocks_per_seq = ceilDiv(num_rows, num_tokens_per_block)
+ blocked_expert_counts_size = num_experts_per_node * num_blocks_per_seq * 4
+ blocked_expert_counts_cumsum_size = blocked_expert_counts_size
+ blocked_row_to_unpermuted_row_size = num_experts_per_node * num_rows * 4
+ permuted_data_size = permuted_elems * hidden_states.element_size()
+ permuted_token_final_scales_size = num_moe_inputs * 4
+
+ ws_map = {}
+ map_offset = 0
+
+ def config_ws(name, size):
+ nonlocal map_offset
+ if size % 256 != 0:
+ size += 256 - size % 256
+ ws_map[name] = (size, map_offset)
+ map_offset += size
+
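+    # Example (illustrative): a request of 100 bytes is rounded up to 256, so
+    # every named slice below starts on a 256-byte boundary inside `workspace`.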
+ config_ws("permuted_row_to_unpermuted_row",
+ permuted_row_to_unpermuted_row_size)
+ config_ws("permuted_token_selected_experts",
+ permuted_token_selected_experts_size)
+ config_ws("unpermuted_row_to_permuted_row", src_to_dest_map_size)
+ config_ws("blocked_expert_counts", blocked_expert_counts_size)
+ config_ws("blocked_expert_counts_cumsum",
+ blocked_expert_counts_cumsum_size)
+ config_ws("blocked_row_to_unpermuted_row",
+ blocked_row_to_unpermuted_row_size)
+ config_ws("expert_first_token_offset", expert_first_token_offset_size)
+ config_ws("permuted_token_final_scales", permuted_token_final_scales_size)
+ config_ws("overlapped_gemm1_gemm2_inputs", permuted_data_size)
+
+ workspace = torch.zeros(map_offset,
+ dtype=torch.uint8,
+ device=hidden_states.device)
+ if topk_ids.dtype == torch.int32:
+ topk_ids = topk_ids.to(torch.int64)
+ ops.fused_moe_prologue(
+ input=hidden_states,
+ token_selected_experts=topk_ids,
+ token_final_scales=topk_weights,
+ workspace=workspace,
+ hidden_size=hidden_size,
+ inter_size=inter_size,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ num_experts_on_rank=num_experts_per_node)
+
+ expert_first_token_offset_bytes = workspace[
+ ws_map["expert_first_token_offset"][1]:
+ ws_map["expert_first_token_offset"][1] +
+ expert_first_token_offset_size]
+ unpermuted_row_to_permuted_row_bytes = workspace[
+ ws_map["unpermuted_row_to_permuted_row"][1]:
+ ws_map["unpermuted_row_to_permuted_row"][1] +
+ src_to_dest_map_size]
+ permuted_row_to_unpermuted_row_bytes = workspace[
+ ws_map["permuted_row_to_unpermuted_row"][1]:
+ ws_map["permuted_row_to_unpermuted_row"][1] +
+ permuted_row_to_unpermuted_row_size]
+
+ if torch.compiler.is_compiling():
+ expert_first_token_offset = _bytes_to_typed_tensor(
+ expert_first_token_offset_bytes, torch.int64
+ )
+ unpermuted_row_to_permuted_row = _bytes_to_typed_tensor(
+ unpermuted_row_to_permuted_row_bytes, torch.int32
+ )
+ permuted_row_to_unpermuted_row = _bytes_to_typed_tensor(
+ permuted_row_to_unpermuted_row_bytes, torch.int32
+ )
+ else:
+ expert_first_token_offset = expert_first_token_offset_bytes.view(torch.int64)
+ unpermuted_row_to_permuted_row = unpermuted_row_to_permuted_row_bytes.view(torch.int32)
+ permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_bytes.view(torch.int32)
+ gemm1_input = workspace[ws_map["overlapped_gemm1_gemm2_inputs"][1]:
+ ws_map["overlapped_gemm1_gemm2_inputs"][1] +
+ permuted_data_size].view(hidden_states.dtype).view(
+ num_moe_inputs, hidden_size)
+ # permuted_token_final_scales = workspace[
+ # ws_map["permuted_token_final_scales"][1]:
+ # ws_map["permuted_token_final_scales"][1] +
+ # permuted_token_final_scales_size].view(torch.float)
+ gemm1_output = torch.empty((num_moe_inputs, 2 * inter_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+
+ ########### gemm1 ##################
+ input_B = w13
+
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=gemm1_input,
+ ptr_B=input_B,
+ ptr_scales=w13_scales,
+ ptr_bias=w13_bias,
+ ptr_D=gemm1_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=2 * inter_size,
+ K=hidden_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ # act
+ act_output = torch.empty((num_moe_inputs, inter_size),
+ dtype=gemm1_output.dtype,
+ device=gemm1_output.device)
+ if activation == "silu":
+ ops.silu_and_mul(act_output, gemm1_output)
+ elif activation == "gelu":
+ ops.gelu_and_mul(act_output, gemm1_output)
+ elif activation == "swigluoai":
+ ops.swigluoai_and_mul(act_output, gemm1_output, 1.702, 7.0)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}.")
+
+ ########### gemm2 ##################
+ input_A = act_output.contiguous()
+ input_B = w2
+ gemm2_output = torch.empty((num_moe_inputs, hidden_size),
+ dtype=hidden_states.dtype,
+ device=hidden_states.device)
+ if not is_fp8 and not is_int4 and not is_mxfp4:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=None,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+ else:
+ ops.cutlass_grouped_gemm_interface(
+ ptr_A=input_A,
+ ptr_B=input_B,
+ ptr_scales=w2_scales,
+ ptr_bias=w2_bias,
+ ptr_D=gemm2_output,
+ expert_first_token_offset=expert_first_token_offset,
+ N=hidden_size,
+ K=inter_size,
+ num_experts=num_experts_per_node,
+ is_B_int4=is_int4,
+ is_B_mxfp4=is_mxfp4)
+
+ ops.moe_gather(output, gemm2_output, topk_weights,
+ permuted_row_to_unpermuted_row,
+ unpermuted_row_to_permuted_row,
+ expert_first_token_offset,
+ num_experts_per_node)
+ return output
+
+
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ """Apply jitter to the input tensor for regularization."""
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ """Compute the top-k scores from the logits."""
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
+
+def route_tokens_xpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ training: bool = False,
+) -> tuple:
+ """Route tokens to experts and compute expert weights and indices (XPU version)."""
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
+
+def _get_device_mesh(model):
+ """Extract device_mesh from child's unused pre_hook closure for EP support."""
+ try:
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Get EP (Expert Parallelism) parameters
+ ep_size = 1
+ ep_rank = 0
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = _get_device_mesh(self)
+ if device_mesh is not None:
+ expert_parallel_group = device_mesh.get_group()
+ if expert_parallel_group is not None:
+ import torch.distributed as dist
+ if dist.is_initialized():
+ ep_size = dist.get_world_size(expert_parallel_group)
+ ep_rank = dist.get_rank(expert_parallel_group)
+
+ # Number of experts on this rank
+ num_experts_on_rank = moe_num_experts // ep_size
+
+ # Detect activation type - check for GptOss-style swigluoai activation
+ # GptOssExperts has alpha and limit attributes for swigluoai
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+
+ # Get weight tensors - support different naming conventions
+ if hasattr(self.experts, "gate_up_proj"):
+ w13 = self.experts.gate_up_proj
+ # NOTE: swigluoai_and_mul kernel expects interleaved layout [g0,u0,g1,u1,...]
+ # which matches GptOss's gate_up_proj layout, so no conversion needed.
+
+ elif hasattr(self.experts, "w1"):
+ # Combine w1 and w3 if stored separately
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w13 = torch.cat([w1, w3], dim=-2)
+ else:
+ w13 = w1
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w13_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Get quantization info
+ is_fp8 = getattr(self.experts, "is_fp8", False)
+ is_int4 = getattr(self.experts, "is_int4", False)
+ is_mxfp4 = getattr(self.experts, "is_mxfp4", False)
+
+ w13_scales = getattr(self.experts, "gate_up_proj_scales", None)
+ w2_scales = getattr(self.experts, "down_proj_scales", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_xpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ self.training,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call XPU fused MoE kernel
+ output = xpu_fused_moe(
+ hidden_states=x_flat,
+ w13=w13,
+ w13_scales=w13_scales,
+ w13_bias=w13_bias,
+ w2=w2,
+ w2_scales=w2_scales,
+ w2_bias=w2_bias,
+ topk_weights=expert_weights.float(),
+ topk_ids=expert_indices,
+ n_experts_per_token=moe_top_k,
+ activation=activation,
+ num_experts=num_experts_on_rank,
+ ep_rank=ep_rank,
+ ep_size=ep_size,
+ is_fp8=is_fp8,
+ is_int4=is_int4,
+ is_mxfp4=is_mxfp4,
+ )
+
+ # All-reduce across EP group to combine partial expert outputs
+ if ep_size > 1 and expert_parallel_group is not None:
+ import torch.distributed as dist
+ dist.all_reduce(output, op=dist.ReduceOp.SUM, group=expert_parallel_group)
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "xpu_fused_moe",
+ "cutlass_grouped_gemm",
+ "cutlass_grouped_gemm_xe2",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/__init__.py b/build/torch29-cxx11-xpu20252-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..38075732c6d8fa0e1e6ef493145e1aca3851ae6b
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/__init__.py
@@ -0,0 +1,202 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from ._ops import ops
+
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+
+
+from ._layers.arguments import Arguments
+from ._layers.dmoe import ParallelDroplessMLP, dMoE
+from ._layers.glu import SparseGLU
+from ._layers.mlp import MLP, SparseMLP
+from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
+
+from . import layers
+
+# This section contains the direct kernel exports (not included in the original code).
+def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute exclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.exclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
+ """
+ Compute inclusive cumulative sum along the specified dimension.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ result = ops.inclusive_cumsum(x, dim)
+ out.copy_(result)
+ return out
+
+
+def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
+ """
+ Compute histogram of input tensor values.
+
+ Args:
+ x: Input tensor
+ num_bins: Number of histogram bins
+
+ Returns:
+ Histogram tensor with counts for each bin
+ """
+ return ops.histogram(x, num_bins)
+
+
+def indices(
+ padded_bins: torch.Tensor,
+ block_size: int,
+ output_block_rows: int,
+ output_block_columns: int,
+) -> torch.Tensor:
+ """
+ Construct indices from padded bins for sparse operations.
+
+ Args:
+ padded_bins: Tensor containing bin boundaries
+ block_size: Size of each block
+ output_block_rows: Number of rows in output blocks
+ output_block_columns: Number of columns in output blocks
+
+ Returns:
+ Tensor containing constructed indices
+ """
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
+
+
+def replicate_forward(
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Forward pass of replicate operation - replicate values according to bin sizes.
+
+ Args:
+ x: Input tensor with values to replicate
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_forward(x, bins, out)
+
+
+def replicate_backward(
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Backward pass of replicate operation - reduce gradients back to bins.
+
+ Args:
+ grad: Gradient tensor to reduce
+ bins: Tensor containing bin sizes
+ out: Output tensor (modified in-place)
+
+ Returns:
+ The output tensor
+ """
+ return ops.replicate_backward(grad, bins, out)
+
+
+def sort(
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
+) -> torch.Tensor:
+ """
+ Radix sort with index tracking.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+ x_out: Output tensor for sorted values
+ iota_out: Output tensor for sorted indices
+
+ Returns:
+ The sorted values tensor
+ """
+ return ops.sort(x, end_bit, x_out, iota_out)
+
+
+# Convenience functions for common use cases
+def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
+ """
+ Compute cumulative sum with automatic output allocation.
+
+ Args:
+ x: Input tensor
+ dim: Dimension along which to compute cumsum (default: last dimension)
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
+
+ Returns:
+ New tensor containing the cumulative sum
+ """
+ out = torch.empty_like(x)
+ if exclusive:
+ return exclusive_cumsum(x, dim, out)
+ else:
+ return inclusive_cumsum(x, dim, out)
+
+
+def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Sort tensor and return both sorted values and indices.
+
+ Args:
+ x: Input tensor to sort
+ end_bit: Number of bits to consider in sorting
+
+ Returns:
+ Tuple of (sorted_values, sorted_indices)
+ """
+ x_out = torch.empty_like(x)
+ iota_out = torch.empty_like(x)
+ sort(x, end_bit, x_out, iota_out)
+ return x_out, iota_out
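+
+
+# Usage sketch (illustrative; argument choices are assumptions, not part of
+# the exported API contract):
+#
+#   expert_ids = torch.randint(0, 8, (1024,), dtype=torch.int32, device="cuda")
+#   counts = histogram(expert_ids, num_bins=8)        # tokens per expert
+#   starts = cumsum(counts, exclusive=True)           # start offset per expert
+#   sorted_ids, sorted_pos = argsort(expert_ids, end_bit=3)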
+
+
+# Export public API
+__all__ = [
+    # Direct kernel exports
+ "exclusive_cumsum",
+ "inclusive_cumsum",
+ "histogram",
+ "indices",
+ "replicate_forward",
+ "replicate_backward",
+ "sort",
+ "cumsum",
+ "argsort",
+ # Original exports
+ "Arguments",
+ "ParallelDroplessMLP",
+ "dMoE",
+ "SparseGLU",
+ "MLP",
+ "SparseMLP",
+ "MoE",
+ "ParallelMLP",
+ "get_load_balancing_loss",
+]
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/__init__.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a720e7a2cc4e44636f6e433a2750e945dc38e8b2
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# from megablocks.layers.dmoe import dMoE
+from .moe import MoE
+
+__all__ = [
+ 'MoE',
+ # 'dMoE',
+]
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/activation_fn.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/activation_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1d956704840aa4daf7d1d71d24e051567feab9
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/activation_fn.py
@@ -0,0 +1,33 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Union
+
+import torch
+from ..stk import Matrix
+
+
+def act_fn(
+ x: Matrix,
+ function: Callable,
+ return_grad_fn: bool = False,
+ **kwargs,
+) -> Union[tuple[Matrix, Any], Matrix]:
+ assert isinstance(x, Matrix)
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
+ if return_grad_fn:
+ x.data.requires_grad = True
+ out = function(x.data, **kwargs)
+ y = Matrix(
+ x.size(),
+ out,
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ if return_grad_fn:
+ return y, out.backward
+ return y
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/all_to_all.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/all_to_all.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac7067bcaa34db1d82b340c43550fe3577aa7a3
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/all_to_all.py
@@ -0,0 +1,54 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+
+class AllToAllOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
+
+ ctx.input_shape = x.shape
+ ctx.output_split_sizes = output_split_sizes
+ ctx.input_split_sizes = input_split_sizes
+ ctx.group = group
+ handle = dist.all_to_all_single(
+ out,
+ x,
+ output_split_sizes=output_split_sizes,
+ input_split_sizes=input_split_sizes,
+ group=group,
+ async_op=async_op,
+ )
+ return out, handle
+
+ @staticmethod
+ def backward(ctx, grad, _):
+ if ctx.needs_input_grad[0]:
+ out = torch.empty(
+ ctx.input_shape,
+ device=grad.device,
+ dtype=grad.dtype,
+ )
+ dist.all_to_all_single(
+ out,
+ grad,
+ output_split_sizes=ctx.input_split_sizes,
+ input_split_sizes=ctx.output_split_sizes,
+ group=ctx.group,
+ )
+ return out, None, None, None, None
+ return None, None, None, None, None
+
+
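+# Convenience wrapper. A usage sketch (assuming torch.distributed is already
+# initialized and `group` spans the expert-parallel ranks):
+#
+#   out, handle = all_to_all(x, recv_counts, send_counts, group, async_op=True)
+#   handle.wait()  # block until the permuted tokens have arrived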
+def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
+ return AllToAllOp.apply(
+ x,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ async_op,
+ )
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/arguments.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db9b1bd38bc2e2f421625c124f86b85f45c5ae0
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/arguments.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
+
+# Type annotation for in-place Tensor initialization function.
+InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
+
+_ALLOWED_BITWIDTHS = (-1, 4, 8)
+
+DEFAULT_ACTIVATION_FN = partial(F.gelu, approximate='tanh')
+
+
+@dataclasses.dataclass
+class Arguments:
+ # Model arguments.
+ hidden_size: int = 1024
+ ffn_hidden_size: int = 4096
+ num_layers: int = 1
+ bias: bool = True
+ return_bias: bool = True
+ activation_fn: Optional[Callable] = DEFAULT_ACTIVATION_FN
+
+ # MoE arguments.
+ moe_num_experts: int = 1
+ moe_top_k: int = 1
+ moe_capacity_factor: int = 1
+ moe_normalize_expert_weights: Optional[Union[int, float]] = None
+ moe_loss_weight: float = 0.1
+ moe_jitter_eps: Optional[float] = None
+ moe_lbl_in_fp32: bool = False
+
+ # Parallelism arguments.
+ moe_expert_model_parallelism: bool = False
+ expert_parallel_group: Optional[dist.ProcessGroup] = None
+ pipeline_model_parallel_size: int = 1
+ num_layers_per_virtual_pipeline_stage: Optional[int] = None
+
+ # Compute arguments.
+ memory_optimized_mlp: bool = False
+ mlp_type: str = 'mlp'
+ mlp_impl: str = 'sparse'
+
+ # Initialization arguments.
+ fp16: bool = True
+ bf16: bool = False
+ device: Union[int, torch.device] = dataclasses.field(default_factory=torch.cuda.current_device)
+ init_method: InitFn = partial(torch.nn.init.normal_, mean=0.0, std=0.02)
+ output_layer_init_method: InitFn = init_method
+
+ # Benchmarking arguments.
+ uniform_expert_assignment: bool = False
+
+ # shared expert arguments
+ shared_expert: bool = False # enable using shared expert
+ fc_cls: Any = torch.nn.Linear # class of the fully connected layer in shared expert (purpose: to allow using custom FC layer eg te.Linear (for FP8))
+ fc_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict,) # kwargs for custom fc layers
+ remat_act_fn: bool = True # enable act fn to be rematerialized instead of stored
+ shared_expert_hidden_size: Optional[
+ int] = None # hidden size of the shared expert IF we want to set it to something different from hidden_size
+    shared_expert_weighted_sum: bool = False  # enable using weighted sum for shared expert output (weighted by number of experts used)
+
+ # Router Z-loss arguments
+ moe_zloss_weight: float = 0 # 1e-3 is a reasonable value
+ moe_zloss_in_fp32: bool = False
+
+ def __post_init__(self):
+ # Sparse MLP is not supported with triton >=3.2.0
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
+ if self.__getattribute__('mlp_impl') == 'sparse':
+            try:
+                import triton
+                from packaging import version
+                if version.parse(triton.__version__) >= version.parse('3.2.0'):
+ raise ValueError(
+ 'Sparse MLP is not supported with triton >=3.2.0. Please use mlp_impl="grouped" instead.',
+ )
+ except ImportError:
+ raise ImportError('Triton is required for sparse MLP implementation')
+
+ if self.__getattribute__('mlp_impl') == 'grouped':
+ grouped_gemm.assert_grouped_gemm_is_available()
+
+ if self.shared_expert_hidden_size is None:
+ self.shared_expert_hidden_size = self.ffn_hidden_size
+
+
+def from_megatron(megatron_args: Any):
+ args = Arguments()
+ for field in dataclasses.fields(args):
+ if hasattr(megatron_args, field.name):
+ setattr(args, field.name, getattr(megatron_args, field.name))
+ return args
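+
+
+# Illustrative construction (added as documentation; values are arbitrary):
+#
+#   args = Arguments(
+#       hidden_size=1024,
+#       ffn_hidden_size=4096,
+#       moe_num_experts=8,
+#       moe_top_k=2,
+#       mlp_impl='grouped',   # 'sparse' requires triton < 3.2.0
+#       fp16=False,
+#       bf16=True,
+#   )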
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/common.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d07109702963ba48a3b94ab860807954dfd79c1
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/common.py
@@ -0,0 +1,26 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from .arguments import Arguments
+
+
+def dtype(args: Arguments):
+ if args.fp16:
+ return torch.float16
+ elif args.bf16:
+ return torch.bfloat16
+ return None
+
+
+def cast_if_autocast_enabled(tensor):
+ if torch.is_autocast_enabled():
+ if tensor.device.type == 'cuda':
+ dtype = torch.get_autocast_gpu_dtype()
+ elif tensor.device.type == 'cpu':
+ dtype = torch.get_autocast_cpu_dtype()
+ else:
+ raise NotImplementedError()
+ return tensor.to(dtype=dtype)
+ return tensor
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/dmlp_registry.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/dmlp_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2ed047042e438c7190ebb139b6f7f30009734c
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/dmlp_registry.py
@@ -0,0 +1,42 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+from . import glu, mlp
+from .arguments import Arguments
+
+MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
+
+_REGISTRY = {
+ 'mlp': {
+ 'grouped': mlp.GroupedMLP,
+ 'sparse': mlp.SparseMLP,
+ },
+ 'glu': {
+ 'grouped': glu.GroupedGLU,
+ 'sparse': glu.SparseGLU,
+ },
+}
+
+
+def get(args: Arguments) -> MlpType:
+ """Returns an MLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ MLP instance. This only contains MLPs for use in dMoEs
+    (i.e. only for the dropless versions of MoEs).
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated MLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ if args.mlp_impl not in _REGISTRY[args.mlp_type]:
+ raise ValueError(f'{args.mlp_type} does not support {args.mlp_impl} backend.',)
+
+ return _REGISTRY[args.mlp_type][args.mlp_impl](args)
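+
+
+# Illustrative lookup (added as documentation): with the defaults
+# mlp_type='mlp' and mlp_impl='sparse', get(args) returns mlp.SparseMLP(args);
+# with mlp_type='glu' and mlp_impl='grouped' it returns glu.GroupedGLU(args).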
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/dmoe.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/dmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0375a4df2f27134c4127e60be04f3b45693050
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/dmoe.py
@@ -0,0 +1,337 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/dmoe.py is needed.',
+# )
+
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import stk
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
+
+def promote_scalar(x):
+ return x.view(1) if not len(x.size()) else x
+
+
+class ParallelDroplessMLP(moe.ParallelMLP):
+
+ def __init__(self, args: Arguments):
+ super(ParallelDroplessMLP, self).__init__(args)
+ self.hidden_size = args.hidden_size
+ self.ffn_hidden_size = mpu.features_per_rank(args)
+ self.blocking = 128
+ self.mlp = dmlp_registry.get(args)
+
+ # Calculate the number of bits needed to represent the column indices
+ # in the intermediate sparse matrix.
+ max_column_index = ((self.ffn_hidden_size * self.num_experts) // self.blocking)
+ self.transpose_sort_end_bit = max(
+ int(np.ceil(np.log2(max_column_index))),
+ 1,
+ )
+
+ def sparse_transpose(self, size, row_indices, column_indices, offsets):
+ block_columns = size[1] // self.blocking
+
+ # Sort row indices by column indices to get the transposed matrix's
+ # column indices.
+ #
+ # NOTE: Our sort operation uses the same width indices as the input values.
+ # To avoid overflow when we have large activation matrices we cast to
+ # 32-bit before sorting.
+ _, gather_indices = ops.sort(
+ column_indices.int(),
+ self.transpose_sort_end_bit,
+ )
+
+ # There are a constant number of blocks in every row of the sparse matrix.
+        # A block's offset is:
+ #
+ # row_index * blocks_per_row + column_index % blocks_per_row
+ #
+ # Once we have the block offsets ordered for transposition we can divide
+ # by blocks_per_row to get the transposed column indices.
+ column_indices_t = row_indices.gather(0, gather_indices.long())
+ block_offsets_t = gather_indices.int()
+
+ zero = torch.zeros((1,), dtype=torch.int32, device=row_indices.device)
+ nnz_per_column = ops.histogram(column_indices, block_columns)
+ nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0)
+ if nnz_per_column.dim() == 0:
+ # This addresses an edge case when ffn_hidden_size is equal to self.blocking.
+ nnz_per_column = nnz_per_column.unsqueeze(0)
+ offsets_t = torch.cat([zero, nnz_per_column])
+ return column_indices_t, offsets_t, block_offsets_t
+
+ def topology(self, x, padded_bins):
+ padded_tokens, _ = x.size()
+ assert padded_tokens % self.blocking == 0
+ if self.ffn_hidden_size % self.blocking != 0:
+ raise ValueError(
+ f'The ffn_hidden_size {self.ffn_hidden_size} must be divisible by ' +
+ f'the block size {self.blocking}. Please update your configuration.',
+ )
+
+ # Offsets for the sparse matrix. All rows have the
+ # same number of nonzero blocks dictated by the
+ # dimensionality of a single expert.
+ block_rows = padded_tokens // self.blocking
+ blocks_per_row = self.ffn_hidden_size // self.blocking
+ offsets = torch.arange(
+ 0,
+ block_rows * blocks_per_row + 1,
+ blocks_per_row,
+ dtype=torch.int32,
+ device=x.device,
+ )
+
+ # Indices for the sparse matrix. The indices for
+ # the intermediate matrix are dynamic depending
+ # on the mapping of tokens to experts.
+ column_indices = ops.topology(
+ padded_bins,
+ self.blocking,
+ block_rows,
+ blocks_per_row,
+ )
+
+ # TODO(tgale): This is unused. Remove the need for this in stk.
+ # For now, use meta init to save the device memory.
+ data = torch.empty(
+ column_indices.numel(),
+ self.blocking,
+ self.blocking,
+ dtype=common.dtype(self.args),
+ device='meta',
+ )
+ shape = (
+ padded_tokens,
+ self.ffn_hidden_size * mpu.experts_per_rank(self.args),
+ )
+ row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+ column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose(
+ shape,
+ row_indices,
+ column_indices,
+ offsets,
+ )
+ return stk.Matrix(
+ shape,
+ data,
+ row_indices,
+ column_indices,
+ offsets,
+ column_indices_t,
+ offsets_t,
+ block_offsets_t,
+ )
+
+ def indices_and_padded_bins(self, top_experts):
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ top_experts = top_experts.int()
+ bin_ids, indices = ops.sort(top_experts, self.sort_end_bit)
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ tokens_per_expert = ops.histogram(top_experts, self.num_experts)
+
+ # Round the token counts up to the block size used in
+        # Round the token counts up to the block size used in
+        # the matrix multiplications. Calculate the starting
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Calculate the bin bounds for the sorted tokens.
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = promote_scalar(bins)
+ return indices, bin_ids, bins, padded_bins, tokens_per_expert
+
+ def sparse_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, padded_bins, tokens_per_expert = (self.indices_and_padded_bins(top_experts))
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(
+ x,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ x = ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ # For use in the base-class parallel_forward_once.
+ def sparse_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Round the token counts up to the block size used in the matrix
+ # multiplication. Calculate the starting position of each bin.
+ padded_tokens_per_expert = ops.round_up(
+ tokens_per_expert,
+ self.blocking,
+ )
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+ padded_bins = promote_scalar(padded_bins)
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+
+ # Create the sparse matrix topology.
+ with torch.no_grad():
+ topo = self.topology(x, padded_bins)
+
+ # Perform the expert computation.
+ x = self.mlp(x, topo)
+
+ # Un-route the data for the MoE output.
+ return ops.padded_scatter(
+ x,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ padded_bins,
+ top_k,
+ )
+
+ def grouped_forward_once(self, x, expert_weights, top_experts):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ out = self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ -1, # unused
+ self.args.moe_top_k,
+ )
+ return out, tokens_per_expert
+
+ def grouped_permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,  # unused
+ top_k,
+ ):
+
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Perform the expert computation.
+ x = self.mlp(x, tokens_per_expert)
+
+ # Un-route the data for the MoE output.
+ return ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ def forward_once(self, x, expert_weights, top_experts):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_forward_once(x, expert_weights, top_experts)
+ else:
+ return self.grouped_forward_once(x, expert_weights, top_experts)
+
+ def permute_and_compute(
+ self,
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+        expert_capacity,
+ top_k,
+ ):
+ if self.args.mlp_impl == 'sparse':
+ return self.sparse_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+ else:
+ return self.grouped_permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+                expert_capacity,
+ top_k,
+ )
+
+
+class dMoE(moe.MoE):
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelDroplessMLP(args)
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/gelu.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4c9e6532798615b5c12c96694241a4c18ee8f7b
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/gelu.py
@@ -0,0 +1,52 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# try:
+# import stk
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/gelu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+import torch.nn.functional as F
+
+
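+# Note on the constants below: this is the derivative of the tanh-approximated
+# GELU, gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
+# where 0.79788456 ~= sqrt(2/pi) and 0.1070322243 ~= 3 * 0.044715 * sqrt(2/pi).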
+@torch.jit.script
+def _gelu_backward_inplace(g, x):
+ tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+ ff = (0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out))
+ return g.mul_(ff)
+
+
+def gelu_backward_(grad: stk.Matrix, x: stk.Matrix):
+ # NOTE: The two sparse matrices must have the same topology.
+ if isinstance(grad, stk.Matrix) and isinstance(x, stk.Matrix):
+ return stk.Matrix(
+ x.size(),
+ _gelu_backward_inplace(grad.data, x.data),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
+ return _gelu_backward_inplace(grad, x)
+
+
+def gelu(x: stk.Matrix):
+ assert isinstance(x, stk.Matrix)
+ return stk.Matrix(
+ x.size(),
+ F.gelu(x.data, approximate='tanh'),
+ x.row_indices,
+ x.column_indices,
+ x.offsets,
+ x.column_indices_t,
+ x.offsets_t,
+ x.block_offsets_t,
+ )
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/glu.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/glu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f297a41ff6a1a2a285f5b461951672364b898da
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/glu.py
@@ -0,0 +1,244 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+# import stk.ops
+# try:
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/glu.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+# SharedMLP,
+# SparseMLP,
+# create_dmoe_expert_weights,
+# resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
+ SharedMLP,
+ SparseMLP,
+ create_dmoe_expert_weights,
+ resolve_dtensor,
+)
+
+
+class SparseGLU(SparseMLP):
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.v1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ with torch.no_grad():
+ self.v1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+
+ mpu.set_expert_model_parallel_attributes(
+ self.v1,
+ self._should_set_parallelism_attribute,
+ )
+
+ def forward(self, x, topo):
+ if self.args.memory_optimized_mlp:
+ raise NotImplementedError(
+ 'Memory optimized implementation not yet supported with GLU with sparse kernels.',
+ )
+
+ w1, v1, w2 = self.scale_grad(self.w1), self.scale_grad(self.v1,), self.scale_grad(self.w2)
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Compute the GLU.
+ x1 = stk.ops.sdd(x, w1.t(), topo)
+ x2 = stk.ops.sdd(x, v1.t(), topo)
+
+ activation_fn_out = act_fn(x1, self.args.activation_fn)
+ x1 = stk.ops.mul(activation_fn_out, x2)
+
+ return stk.ops.dsd(x1, w2)
+
+
+class MemoryOptimizedGroupedGLU(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, v1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ v1 = v1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], v1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not v1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1', 'v1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+ v1_out = gg.backend.gmm(x, v1, batch_sizes, trans_b=True)
+
+        # Apply the activation to the first projection and gate with v1_out.
+        activation_fn_out = activation_fn(sdd_out) * v1_out
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, v1, w2, batch_sizes, x, sdd_out, v1_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, v1, w2 = saved_tensors[:3]
+ batch_sizes = saved_tensors[3]
+ x = saved_tensors[4]
+ sdd_out, v1_out = saved_tensors[5:7]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ v1_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out) * v1_out
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+ dv1_out = v1_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dv1.
+ dv1 = gg.backend.gmm(dv1_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ dx = ddsd_out
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=dx)
+ dx += gg.backend.gmm(dv1_out, v1, batch_sizes)
+ return dx, dw1, dv1, dw2, None, None
+
+
+memory_optimized_grouped_glu = MemoryOptimizedGroupedGLU.apply
+
+
+class GroupedGLU(SparseGLU):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, v1, w2 = (
+ self.scale_grad(self.w1),
+ self.scale_grad(self.v1),
+ self.scale_grad(self.w2),
+ )
+ w1, v1, w2 = resolve_dtensor(w1), resolve_dtensor(v1,), resolve_dtensor(w2)
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = w1.view(ne, -1, self.args.hidden_size)
+ v1 = v1.view(ne, -1, self.args.hidden_size)
+ w2 = w2.view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_glu(
+ x,
+ w1,
+ v1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
+ x1 = self.args.activation_fn(x1) * x2
+ return gg.ops.gmm(x1, w2, batch_sizes)
+
+
+class SharedGLU(SharedMLP):
+    """GLU for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTGLU class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__(args)
+ self.gate_proj = args.fc_cls(
+ args.hidden_size,
+ self.args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/memory_test.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/memory_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d1166931b712635131985b25a89f4ca23e576d
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/memory_test.py
@@ -0,0 +1,103 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
+
+_TESTS = ((8, 2048, 4096, 4096, 32, 4),)
+
+
+def get_tensors():
+ ptrs = set()
+ out = []
+ for obj in gc.get_objects():
+ if torch.is_tensor(obj):
+ if not obj.is_contiguous() or obj.data_ptr() in ptrs:
+ continue
+ out.append(obj)
+ ptrs.add(obj.data_ptr())
+ return out
+
+
+def test_memory(
+ group,
+ batch_size,
+ sequence_length,
+ hidden_size,
+ ffn_hidden_size,
+ num_experts,
+ top_k,
+):
+ args = arguments.Arguments(
+ hidden_size=hidden_size,
+ ffn_hidden_size=ffn_hidden_size,
+ moe_num_experts=num_experts,
+ moe_top_k=top_k,
+ moe_expert_model_parallelism=True,
+ expert_parallel_group=group,
+ fp16=False,
+ bf16=True,
+ device=torch.cuda.current_device(),
+ )
+ layer = dmoe.dMoE(args).cuda()
+
+ x = torch.randn((batch_size, sequence_length, hidden_size),
+ device=torch.cuda.current_device(),
+ dtype=torch.bfloat16).requires_grad_(True)
+ torch.cuda.empty_cache()
+
+ # Run forward + backward.
+ # with torch.autograd.detect_anomaly():
+ out, _ = layer(x)
+ out.mean().backward()
+
+ # Report peak memory.
+ mem = torch.cuda.max_memory_allocated()
+    print('Max Memory Allocated = {:0.0f}MB'.format(mem / 1e6))
+    print('Max Memory Reserved = {:0.0f}MB'.format(torch.cuda.max_memory_reserved() / 1e6,),)
+
+ # Calculate weight and gradient memory usage.
+ weight_memory = 2 * (
+ layer.router.layer.weight.numel() + layer.experts.mlp.w1.numel() + layer.experts.mlp.w2.numel()
+ )
+
+ def grad_numel(x):
+ if x.grad is not None:
+ return x.grad.numel()
+ return 0
+
+ grad_memory = 2 * (
+ grad_numel(layer.router.layer.weight) + grad_numel(layer.experts.mlp.w1) + grad_numel(layer.experts.mlp.w2)
+ )
+ weight_memory += grad_memory
+
+    print('Weight Memory Allocated = {:0.0f}MB'.format(weight_memory / 1e6))
+    print('Activation Memory Allocated = {:0.0f}MB'.format((mem - weight_memory) / 1e6,),)
+
+ # Manually calculate GPU memory usage from the garbage
+ # collector.
+ gc.collect()
+ total = 0
+ tensors = get_tensors()
+ tensors = sorted(tensors, key=lambda x: -x.numel())
+ for i, t in enumerate(tensors):
+ total += t.numel()
+ print(f'{i}: {t.shape}, {t.numel() * 2}')
+ del tensors
+
+    print('Total Bytes Found = {:0.0f}MB'.format(total * 2 / 1e6))
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _TESTS:
+ test_memory(group, *args)
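+
+# Launch sketch (illustrative; the exact module path depends on where this
+# file is installed):
+#
+#   torchrun --nproc_per_node=<num_gpus> -m <package>._layers.memory_test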
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/mlp.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99afb9904c24a8b6a83e79059cd1251dbbfd99e
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/mlp.py
@@ -0,0 +1,587 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+# try:
+# import stk
+# import stk.backend.triton_kernels
+# import stk.ops
+# except ImportError:
+# import warnings
+# warnings.warn(
+# 'Please add `stanford-stk` if megablocks/_layers/mlp.py is needed.',
+# )
+
+from .. import stk
+
+import torch
+from packaging import version
+
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+class ScaleGradient(torch.autograd.Function):
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx: Any, x: torch.Tensor, scale: float):
+ ctx.scale = scale
+ return x
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: torch.Tensor, grad: torch.Tensor):
+ return grad * ctx.scale, None
+
+
+scale_gradient = ScaleGradient.apply
+
+
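+# resolve_dtensor unwraps a torch.distributed DTensor (available in
+# torch >= 2.0) to its local shard via to_local(), so that the GEMMs below
+# always operate on plain torch.Tensor weights.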
+def resolve_dtensor(weight: torch.Tensor):
+ if version.parse(torch.__version__) >= version.parse('2.0.0'):
+ from torch.distributed._tensor import DTensor
+ if isinstance(weight, DTensor):
+ return weight.to_local()
+ return weight
+
+
+def create_moe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ ffn_hidden_size: int,
+ hidden_size: int,
+ init_method: InitFn,
+):
+ # Create the entire weight matrix such that the sampled weights will
+ # not vary between data parallelism and expert model parallelism for
+ # the same random seed.
+ master_weights = torch.empty(
+ num_experts,
+ ffn_hidden_size,
+ hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ )
+ init_method(master_weights)
+
+ if not args.moe_expert_model_parallelism:
+ return master_weights
+
+ # Calculate the amount of sharding in each dimension.
+ expert_sharding_degree = mpu.expert_sharding_degree(args)
+ hidden_sharding_degree = mpu.hidden_sharding_degree(args)
+
+ # Calculate the experts per rank.
+ #
+ # NOTE: We assign ranks to be expert parallel before going
+ # tensor parallel.
+ rank = mpu.get_expert_parallel_rank(args)
+ expert_rank = rank % expert_sharding_degree
+ num_experts_per_rank = num_experts // expert_sharding_degree
+ start_expert = expert_rank * num_experts_per_rank
+ end_expert = (expert_rank + 1) * num_experts_per_rank
+
+ # Calculate the rows per rank.
+ row_rank = rank // expert_sharding_degree
+ num_rows_per_rank = ffn_hidden_size // hidden_sharding_degree
+ start_row = row_rank * num_rows_per_rank
+ end_row = (row_rank + 1) * num_rows_per_rank
+
+ # Slice the weight matrix to get the chunk for this rank.
+ with torch.no_grad():
+ weights = master_weights[start_expert:end_expert, start_row:end_row]
+ return weights
+
+
+class MLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ # expert_parallel_world_size = mpu.get_expert_parallel_world_size(args)
+ experts_per_rank = mpu.experts_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ args.hidden_size,
+ mpu.features_per_rank(args),
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ experts_per_rank,
+ mpu.features_per_rank(args),
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ args.moe_expert_model_parallelism,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ args.moe_expert_model_parallelism,
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ w1 = create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ )
+ self.w1.copy_(w1.transpose(1, 2).contiguous())
+ self.w2.copy_(
+ create_moe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ x = torch.bmm(x, w1)
+ x = self.args.activation_fn(x)
+ return torch.bmm(x, w2)
+
+
+def create_dmoe_expert_weights(
+ args: Arguments,
+ num_experts: int,
+ rows: int,
+ columns: int,
+ init_method: InitFn,
+):
+ weights = create_moe_expert_weights(
+ args,
+ num_experts,
+ rows,
+ columns,
+ init_method,
+ )
+ return weights.view([-1, columns])
+
+
+class MemoryOptimizedMLP(torch.autograd.Function):
+ """Sparse MLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, topo, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ topo_tensors = (
+ topo.row_indices,
+ topo.column_indices,
+ topo.offsets,
+ topo.column_indices_t,
+ topo.offsets_t,
+ topo.block_offsets_t,
+ )
+
+ # Layer 0: x @ w1.t().
+ sdd_out = stk.ops.sdd(x, w1.t(), topo)
+
+ # GeLU.
+ activation_fn_out = act_fn(sdd_out, activation_fn)
+
+ # Layer 1: x @ w2.
+ dsd_out = stk.ops.dsd(activation_fn_out, w2)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.shape = topo.shape
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.data.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, *topo_tensors, x, sdd_out.data)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx, ddsd_out):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ topo_tensors = saved_tensors[2:8]
+ x = saved_tensors[8]
+ sdd_out_data = saved_tensors[9]
+
+ # rematerialize activation function output
+ activation_fn = ctx.activation_fn
+ sdd_out = stk.Matrix(ctx.shape, sdd_out_data, *topo_tensors)
+ activation_fn_out, activation_grad_fn = act_fn(
+ sdd_out,
+ activation_fn,
+ return_grad_fn=True,
+ )
+
+ # Compute dw2 with recomputed activation_fn output.
+ dw2 = stk.ops.dsd(activation_fn_out.t(), ddsd_out)
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ stk.backend.triton_kernels.sdd(
+ ddsd_out,
+ w2.t(),
+ dactivation_fn_out.shape,
+ dactivation_fn_out.data,
+ dactivation_fn_out.offsets,
+ dactivation_fn_out.row_indices,
+ dactivation_fn_out.column_indices,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out.data)
+ dsdd_out = stk.Matrix(ctx.shape, sdd_out.data.grad, *topo_tensors)
+
+ # Compute dw1.
+ dw1 = stk.ops.dsd(dsdd_out.t(), x)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ stk.backend.triton_kernels.dsd(
+ dsdd_out.shape,
+ dsdd_out.data,
+ dsdd_out.offsets,
+ dsdd_out.row_indices,
+ dsdd_out.column_indices,
+ dsdd_out.offsets_t,
+ dsdd_out.column_indices_t,
+ dsdd_out.block_offsets_t,
+ False,
+ w1,
+ ddsd_out,
+ )
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_mlp = MemoryOptimizedMLP.apply
+
+
+class SparseMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)
+
+ self.w1 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ self.w2 = torch.nn.Parameter(
+ torch.empty(
+ self._num_rows_per_rank,
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+
+ # Initialize the parameters for the MLP.
+ #
+ # NOTE: It is important that we create the weight tensors prior
+        # to creating the master weights and slicing out the piece for
+ # this rank. If the master weights are created first the PyTorch
+ # caching allocator appears to use the same memory block for these
+ # and the slice which causes large increases in our peak memory
+ # usage.
+ with torch.no_grad():
+ self.w1.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.init_method,
+ ),
+ )
+ self.w2.copy_(
+ create_dmoe_expert_weights(
+ args,
+ args.moe_num_experts,
+ args.ffn_hidden_size,
+ args.hidden_size,
+ args.output_layer_init_method,
+ ),
+ )
+
+ self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
+ mpu.set_expert_model_parallel_attributes(
+ self.w1,
+ self._should_set_parallelism_attribute,
+ )
+ mpu.set_expert_model_parallel_attributes(
+ self.w2,
+ self._should_set_parallelism_attribute,
+ )
+
+ self.gradient_scale = None
+ if self.args.moe_expert_model_parallelism:
+ self.gradient_scale = 1 / mpu.get_expert_parallel_world_size(self.args,)
+
+ def scale_grad(self, w):
+ if self.gradient_scale is None:
+ return w
+ return scale_gradient(w, self.gradient_scale)
+
+ def forward(self, x, topo):
+ w1, w2 = self.scale_grad(self.w1), self.scale_grad(self.w2)
+ w1, w2 = resolve_dtensor(w1), resolve_dtensor(w2)
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_mlp(
+ x,
+ w1,
+ w2,
+ topo,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ x = stk.ops.sdd(x, w1.t(), topo)
+ activation_fn_out = act_fn(x, self.args.activation_fn)
+ return stk.ops.dsd(activation_fn_out, w2)
+
+
+class MemoryOptimizedGroupedMLP(torch.autograd.Function):
+ """GroupedMLP with manually scheduled memory reuse."""
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_fwd(device_type='cuda')
+ def forward(ctx, x, w1, w2, batch_sizes, activation_fn):
+ # Cast inputs using ctx dtype from AMP
+ if ctx._fwd_used_autocast:
+ x = x.to(ctx._dtype)
+ w1 = w1.to(ctx._dtype)
+ w2 = w2.to(ctx._dtype)
+ # x: [m, k], w1: [n, k], w2: [n, k]
+ if (not x.is_contiguous() or not w1.is_contiguous() or not w2.is_contiguous()):
+ raise ValueError("Expected contiguous 'x', 'w1' and 'w2'.")
+
+ # Layer 0: x @ w1.t().
+ assert gg.backend is not None
+ sdd_out = gg.backend.gmm(x, w1, batch_sizes, trans_b=True)
+
+ # activation_fn
+ activation_fn_out = activation_fn(sdd_out)
+
+ # Layer 1: x @ w2.
+ dsd_out = gg.backend.gmm(activation_fn_out, w2, batch_sizes)
+
+ # NOTE: Save the input to the layer and the activation_fn input for
+ # gradient computation. We'll re-compute the activation_fn forward
+ # pass in the backward pass to avoid materializing another
+ # intermediate.
+ ctx.x_shape = x.shape
+ ctx.sdd_out_shape = sdd_out.shape
+ ctx.dtype = x.dtype
+ ctx.activation_fn = activation_fn
+ ctx.save_for_backward(w1, w2, batch_sizes, x, sdd_out)
+ return dsd_out
+
+ @staticmethod
+ @torch.amp.autocast_mode.custom_bwd(device_type='cuda')
+ def backward(ctx: Any, ddsd_out: torch.Tensor):
+ if (not ctx.needs_input_grad[0] or not ctx.needs_input_grad[1] or not ctx.needs_input_grad[2]):
+ raise ValueError('Expected all MLP inputs to need grad.')
+
+ # Unpack saved tensors
+ # dtype = ctx.dtype
+ saved_tensors = ctx.saved_tensors
+ w1, w2 = saved_tensors[:2]
+ batch_sizes = saved_tensors[2]
+ x = saved_tensors[3]
+ sdd_out = saved_tensors[4]
+
+ # Rematerialize activation_fn output.
+ activation_fn = ctx.activation_fn
+ with torch.set_grad_enabled(True):
+ sdd_out.requires_grad = True
+ activation_fn_out = activation_fn(sdd_out)
+ activation_grad_fn = activation_fn_out.backward
+
+ # Compute dw2 with recomputed activation_fn output.
+ assert gg.backend is not None
+ dw2 = gg.backend.gmm(
+ activation_fn_out,
+ ddsd_out,
+ batch_sizes,
+ trans_a=True,
+ )
+
+ # Compute dactivation_fn_out.
+ #
+ # NOTE: We reuse the activation_fn_out allocation.
+ dactivation_fn_out = activation_fn_out
+ gg.backend.gmm(
+ ddsd_out,
+ w2,
+ batch_sizes,
+ trans_b=True,
+ c=dactivation_fn_out,
+ )
+
+ # Compute dsdd_out.
+ #
+ # NOTE: This reuses the dactivation_fn_out allocation.
+ if activation_fn is DEFAULT_ACTIVATION_FN:
+ dsdd_out = gelu.gelu_backward_(dactivation_fn_out, sdd_out)
+ else:
+ assert activation_grad_fn is not None
+ activation_grad_fn(dactivation_fn_out)
+ dsdd_out = sdd_out.grad
+
+ # Compute dw1.
+ dw1 = gg.backend.gmm(dsdd_out, x, batch_sizes, trans_a=True)
+
+ # Compute dx.
+ #
+ # NOTE: This reuses the ddsd_out allocation.
+ gg.backend.gmm(dsdd_out, w1, batch_sizes, c=ddsd_out)
+ dx = ddsd_out
+ return dx, dw1, dw2, None, None
+
+
+memory_optimized_grouped_mlp = MemoryOptimizedGroupedMLP.apply
+
+
+class GroupedMLP(SparseMLP):
+
+ def forward(self, x, tokens_per_expert):
+ batch_sizes = tokens_per_expert.cpu().to(torch.long)
+ w1, w2 = (self.scale_grad(self.w1), self.scale_grad(self.w2))
+
+ # Re-shape the weights for the grouped GEMMs.
+ ne = mpu.experts_per_rank(self.args)
+ w1 = resolve_dtensor(w1).view(ne, -1, self.args.hidden_size)
+ w2 = resolve_dtensor(w2).view(ne, -1, self.args.hidden_size)
+
+ if self.args.memory_optimized_mlp:
+ return memory_optimized_grouped_mlp(
+ x,
+ w1,
+ w2,
+ batch_sizes,
+ self.args.activation_fn,
+ )
+
+ # Compute the MLP.
+ assert gg.ops is not None
+ x = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
+ x = self.args.activation_fn(x)
+ return gg.ops.gmm(x, w2, batch_sizes)
+
+
+class SharedMLP(torch.nn.Module):
+ """MLP for shared expert.
+
+ Note: this is a copy -> pasta -> modify of the LLM-Foundry MPTMLP class
+ """
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+ self.fc_kwargs: dict[str, Any] = {
+ 'bias': args.bias,
+ 'device': args.device,
+ }
+ self.fc_kwargs.update(args.fc_kwargs)
+
+ self.up_proj = args.fc_cls(
+ args.hidden_size,
+ args.shared_expert_hidden_size,
+ **self.fc_kwargs,
+ )
+ self.act = args.activation_fn
+ self.down_proj = args.fc_cls(
+ args.shared_expert_hidden_size,
+ args.hidden_size,
+ **self.fc_kwargs,
+ )
+ self.down_proj._is_residual = True # a flag for llm-foundry init
+
+ def add_experts_sharedexpert(
+ self,
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ ) -> torch.Tensor:
+ # Helper function to add expert output to shared expert output
+ # with optional weighted sum.
+ if self.args.shared_expert_weighted_sum:
+ # enable using weighted sum for shared expert output
+            # weighted by the number of experts used
+ t_experts = self.args.moe_top_k + 1
+ sh_mlp_out = shared_expert_out / t_experts
+ return sh_mlp_out.add(
+ expert_out,
+ alpha=(self.args.moe_top_k / t_experts),
+ )
+
+ return shared_expert_out + expert_out
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.down_proj(self.act(self.up_proj(x)))
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/moe.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4aeaacc9c86fc70944e730c53f7a55644e05e
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/moe.py
@@ -0,0 +1,507 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+ sort,
+ histogram,
+ inclusive_cumsum,
+ exclusive_cumsum,
+ binned_gather,
+ binned_scatter,
+ gather,
+ scatter,
+ repeat,
+ replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
+
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args: Arguments):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ # tokens_per_expert[i].shape = (num_experts)
+ # expert_scores[i].shape = (tokens, num_experts)
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = (args.num_layers // args.pipeline_model_parallel_size)
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+            f'Expected {num_layers_per_pipeline_stage} tokens_per_expert '
+ f'but found {len(tokens_per_expert)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f'Expected {num_layers_per_pipeline_stage} expert_scores '
+            f'but found {len(expert_scores)}.\nnum_layers = '
+ f'{args.num_layers}\npipeline_model_parallel_size = '
+ f'{args.pipeline_model_parallel_size}\n'
+ 'num_layers_per_virtual_pipeline_stage'
+ f' = {args.num_layers_per_virtual_pipeline_stage}',
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all((x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert))
+
+ tokens = expert_scores[0].shape[0]
+ assert all(((x.ndim == 2 and x.shape[1] == args.moe_num_experts and x.shape[0] == tokens) for x in expert_scores))
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = (args.moe_num_experts * args.moe_loss_weight)
+ scale_denominator = (args.num_layers * tokens * args.moe_top_k)
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# NOTE: This class defines MoE expert computation, including expert model parallel
+# communication. When using FSDP on top of MegaBlocks this is the module that should
+# be wrapped s.t. the weight all-gathers can be scheduled *before* the expert model
+# parallel all2all.
+class ParallelMLP(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(ParallelMLP, self).__init__()
+ self.args = args
+
+ # Calculate the number of experts in total and the number of experts
+ # owned by this rank.
+ # world_size = mpu.get_expert_parallel_world_size(args)
+ self.num_experts = args.moe_num_experts
+ self.top_k = self.args.moe_top_k
+
+ # Calculate the number of bits needed to represent the expert indices
+ # so that we can pass it to radix sort.
+ self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1)
+
+ # Expert MLP.
+ self.mlp = mlp.MLP(args)
+
+ self.bias: Optional[torch.Tensor]
+ if self.args.bias:
+ # Note that the output bias is not parallelized with expert
+ # model parallelism.
+ self.bias = torch.nn.Parameter(
+ torch.empty(
+ args.hidden_size,
+ device=args.device,
+ dtype=common.dtype(args),
+ ),
+ )
+ torch.nn.init.zeros_(self.bias)
+ else:
+ self.register_parameter('bias', None)
+
+ # Select the forward function for the operating mode.
+ self.forward_fn = (self.parallel_forward_once if args.moe_expert_model_parallelism else self.forward_once)
+
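+    # Worked example (illustrative): with 1024 tokens, top_k = 2, a world size
+    # of 8 and 64 experts, capacity factor 1 reserves
+    # int(1 * 2 * 1024 * 8 / 64) = 256 slots per expert on this rank.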
+ def expert_capacity(self, tokens: int) -> int:
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ tokens_per_expert = (self.top_k * tokens * world_size / self.num_experts)
+ return int(self.args.moe_capacity_factor * tokens_per_expert)
+
+ def load_balancing_loss(self, tokens_per_expert: torch.Tensor, expert_scores: torch.Tensor):
+ """Calculate the load balancing loss contribution."""
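+        # Worked example (illustrative): with 4 experts, 8 tokens and
+        # top_k = 2 there are 16 routing assignments. Under perfectly uniform
+        # routing each expert receives 4 of them and the mean router score per
+        # expert is 0.25, so the loss is
+        # (4 / (8 * 2)) * dot([4, 4, 4, 4], [0.25, 0.25, 0.25, 0.25]) = 1.0.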
+ assert len(expert_scores.size()) == 2
+ tokens, num_experts = expert_scores.size()
+ assert num_experts == self.num_experts
+ assert len(tokens_per_expert.size()) == 1
+ num_experts, = tokens_per_expert.size()
+ assert num_experts == self.num_experts
+ scale = self.num_experts / (tokens * self.top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
+ def indices_and_bins(self,
+ top_expert: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ # Sort the expert ids to produce the scatter/gather
+ # indices for the permutation.
+ #
+ # TODO(tgale): Is it worth doing this conversion to 32-bit
+ # prior? Could we place the `torch.max` operation to return
+ # 32-bit expert indices?
+ top_expert = top_expert.int()
+ # output = ops.sort(top_expert, self.sort_end_bit)
+ output = sort(top_expert, self.sort_end_bit)
+ assert output is not None
+ bin_ids, indices = output
+
+ # Histogram the expert ids to identify the number of
+ # tokens routed to each expert.
+ #
+ # TODO(tgale): Does the sorted data produce a more favorable
+ # data distribution for histogram? Or is the op parallelism
+ # worth more?
+ # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+ tokens_per_expert = histogram(top_expert, self.num_experts)
+
+ # Calculate the bin bounds for the sorted tokens.
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+ bins = inclusive_cumsum(tokens_per_expert, 0)
+ assert bins is not None
+ bins = bins.view(1) if not len(bins.size()) else bins
+
+ assert isinstance(indices, torch.Tensor)
+ assert isinstance(bin_ids, torch.Tensor)
+ assert isinstance(bins, torch.Tensor)
+ assert isinstance(tokens_per_expert, torch.Tensor)
+
+ return indices, bin_ids, bins, tokens_per_expert
+
+ def permute_and_compute(
+ self,
+ x: torch.Tensor,
+ tokens_per_expert: int, # unused
+ indices: torch.Tensor,
+ bin_ids: torch.Tensor, # unused
+ expert_weights: torch.Tensor,
+ bins: torch.Tensor,
+ expert_capacity: int,
+ top_k: int,
+ ):
+ # Route the tokens for MoE computation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+ output = binned_gather(x, indices, bins, expert_capacity, top_k)
+ assert output is not None
+ x = output
+
+ # Perform the expert computation. Note that we don't
+ # use biases for these linear operations.
+ x = self.mlp(x)
+
+ # Un-route the data for the MoE output.
+ # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return binned_scatter(x, indices, expert_weights, bins, top_k)
+
+
+ def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ sl, bs, _ = x.size()
+ expert_capacity = self.expert_capacity(sl * bs)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = self.permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ self.top_k,
+ )
+ return x, tokens_per_expert
+
+ def parallel_forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ # NOTE: This function implements the same computation as forward_once
+ # but with expert model parallelism.
+ #
+ # 1. Permute the tokens locally so that they are grouped by their
+ # expert assignments. This allows us to transfer all of the tokens
+ # for a remote device in one communication primitive.
+ #
+ # 2. Permute the tokens across the expert parallel devices. After
+ # this is completed each device has all of the tokens assigned to
+ # its set of experts in its local HBM.
+ #
+ # 3. Permute the tokens locally so that they are grouped by their
+        #    expert assignment. After the distributed permutation the tokens
+ # are grouped by which device they came from. We re-order them
+ # locally to allow for efficient computation.
+ #
+ # After this series of permutations we compute the linear layers
+ # and then repeat these three steps in reverse to produce the final
+ # output.
+ #
+ # Compute the mapping of local tokens to experts.
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = (self.indices_and_bins(top_experts))
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so every device gets the counts.
+ # repeated_tokens_per_expert = ops.repeat(
+ repeated_tokens_per_expert = repeat(
+ tokens_per_expert,
+ (mpu.hidden_sharding_degree(self.args),),
+ )
+
+ # Pass token count information to the device on which the
+ # target expert resides.
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert,)
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ # Permute locally and without any padding so that tokens for each
+ # parallel device are stored contiguously.
+ #
+ # This view updates the shape of the tensor from [sl, bs, hs] to
+ # [sl * bs, hs] prior to the permutation.
+ x = x.view(-1, x.shape[-1])
+ # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+ output = gather(x, indices, bin_ids, bins, self.top_k)
+ assert output is not None
+ x = output
+
+ # Compute the number of tokens that will be received from each
+ # device and permute the input data across the devices.
+ with torch.no_grad():
+ tpe_handle.wait()
+ experts_per_rank = mpu.experts_per_rank(self.args)
+
+ # Reshape to [world_size, num_experts_per_rank].
+ world_size = mpu.get_expert_parallel_world_size(self.args)
+ repeated_tokens_per_expert = (repeated_tokens_per_expert.view(world_size, experts_per_rank))
+ parallel_tokens_per_expert = (parallel_tokens_per_expert.view(world_size, experts_per_rank))
+
+ # TODO(tgale): It might be faster to do this on the GPU and
+ # then communicate the results back to the host.
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1)
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1)
+
+ # Convert the send/recv counts to lists.
+ send_counts = send_counts.tolist()
+ recv_counts = recv_counts.tolist()
+ tokens_received = sum(recv_counts)
+
+ # If we're sharding the experts along the hidden dimension
+ # multiple devices own parts of the same sets of experts.
+ # Replicate the token counts so devices that share experts
+ # get all of the tokens assigned to them.
+ #
+ # TODO(tgale): Fuse this into the prior, local permutation.
+ # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+ x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+
+ # Start the cross-device permutation asynchronously so we can
+ # overlap communication with computation.
+ parallel_x, parallel_x_handle = all_to_all(
+ x,
+ recv_counts,
+ send_counts,
+ self.args.expert_parallel_group,
+ async_op=True,
+ )
+
+ with torch.no_grad():
+ # After we do the cross-device permutation we have the tokens on the
+ # correct device but not yet grouped by expert because we received
+ # tokens from each device as contiguous chunks. To group the tokens
+ # for expert computation we'll do one more local permutation. The
+ # rest of this torch.no_grad() scope sets up the indices and bins
+ # for this permutation.
+ # replicate_bins = ops.inclusive_cumsum(
+ replicate_bins = inclusive_cumsum(
+ parallel_tokens_per_expert.flatten(),
+ 0,
+ )
+ replicate_bins = (replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins)
+
+ # Construct the expert indices for the permuted tokens.
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ self.num_experts * mpu.hidden_sharding_degree(self.args),
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ mpu.experts_per_rank(self.args),
+ )
+ # parallel_top_expert = ops.replicate(
+ parallel_top_expert = replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # TODO(tgale): The sort_end_bit here can be reduced.
+ # parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_bin_ids, parallel_indices = sort(
+ parallel_top_expert,
+ self.sort_end_bit,
+ )
+
+ # Calculate the bins boundaries from the token counts.
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
+
+ # If expert_capacity is set to zero, set the number of tokens
+ # per expert to the maximum we need to avoid dropping tokens.
+ tokens, _ = x.size()
+ expert_capacity = self.expert_capacity(tokens)
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if self.args.mlp_impl == 'grouped':
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+ parallel_x_handle.wait()
+ parallel_x = self.permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ )
+
+ # Un-permute the tokens across the devices.
+ x, _ = all_to_all(
+ parallel_x,
+ send_counts,
+ recv_counts,
+ self.args.expert_parallel_group,
+ )
+
+ # Reduce along the hidden sharding to get the final outputs.
+ #
+ # TODO(tgale): Fuse this into the following local permutation.
+ shape = (
+ mpu.hidden_sharding_degree(self.args),
+ -1,
+ self.args.hidden_size,
+ )
+ # x = ops.sum(x.view(shape), dim=0)
+ x = x.view(shape).sum(dim=0)
+
+ # Un-permute locally to setup for the next series of operations.
+ # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+ return x, tokens_per_expert.flatten()
+
+ def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
+ in_shape = x.size()
+
+ # Compute the experts.
+ x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
+ if self.training and self.args.moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, scores))
+ x = x.view(in_shape)
+ if self.bias is not None:
+ if self.args.return_bias:
+ return x, self.bias
+ return x + self.bias
+ return x
+
+
+class MoE(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super(MoE, self).__init__()
+
+ # Token router.
+ self.router = router.LearnedRouter(args)
+
+ # Expert computation helper.
+ self.experts = self._init_experts_mlp(args)
+
+ self.shared_expert = None
+ if args.shared_expert:
+ # SharedExpert computation helper.
+ self.shared_expert = sharedexpert_registry.get(args)
+
+ def _init_experts_mlp(self, args: Arguments):
+ return ParallelMLP(args)
+
+ def forward(self, x: torch.Tensor):
+ # NOTE: If we're going to cast the activations to lower precision
+ # do it before we permute the tokens to save bandwidth.
+ x = common.cast_if_autocast_enabled(x)
+
+ # Compute the expert scores and assignments.
+ scores, expert_weights, top_experts = self.router(x)
+
+ # Compute the experts.
+ out = self.experts(x, scores, expert_weights, top_experts)
+ if self.shared_expert is not None:
+ shared_expert_out = self.shared_expert(x)
+ out = self.shared_expert.add_experts_sharedexpert(
+ shared_expert_out,
+ out,
+ )
+ return out
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/mpu.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/mpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..434e143ab42bf3f83406d69e9dd1f72777716e22
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/mpu.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
+
+
+class MoeParam(torch.Tensor):
+
+ def __init__(self):
+ super().__init__(self)
+ self.expert_model_parallel: bool
+
+
+def is_moe_param(tensor: torch.Tensor) -> bool:
+ return hasattr(tensor, 'expert_model_parallel')
+
+
+def get_expert_parallel_world_size(args: Arguments) -> int:
+ return (dist.get_world_size(args.expert_parallel_group) if args.moe_expert_model_parallelism else 1)
+
+
+def get_expert_parallel_rank(args: Arguments) -> int:
+ return (dist.get_rank(args.expert_parallel_group) if args.moe_expert_model_parallelism else 0)
+
+
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, 'expert_model_parallel')
+ setattr(tensor, 'expert_model_parallel', is_parallel)
+
+
+def param_is_expert_model_parallel(param: MoeParam) -> bool:
+ return (hasattr(param, 'expert_model_parallel') and param.expert_model_parallel)
+
+
+def copy_expert_model_parallel_attributes(
+ destination_tensor: torch.Tensor,
+ source_tensor: torch.Tensor,
+):
+ if hasattr(source_tensor, 'expert_model_parallel'):
+ setattr(
+ destination_tensor,
+ 'expert_model_parallel',
+ getattr(source_tensor, 'expert_model_parallel'),
+ )
+
+
+def synchronized_print(group: Optional[dist.ProcessGroup], *x: torch.Tensor):
+ world_size = dist.get_world_size(group)
+ rank = dist.get_rank(group)
+ for i in range(world_size):
+ dist.barrier(group)
+ if i == rank:
+ print(f'rank = {rank}', *x)
+
+
+# Helpers for expert/tensor sharding.
+def expert_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = min(world_size, args.moe_num_experts)
+
+ if (args.moe_num_experts % esd) != 0:
+ raise ValueError(f'Cannot shard {args.moe_num_experts} experts {esd} ways.',)
+ return esd
+
+
+def hidden_sharding_degree(args: Arguments) -> int:
+ world_size = get_expert_parallel_world_size(args)
+ esd = expert_sharding_degree(args)
+ hsd = world_size // esd
+
+ if (args.ffn_hidden_size % hsd) != 0:
+ raise ValueError(f'Cannot shard {args.ffn_hidden_size} features {hsd} ways.',)
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. 'expert_sharding_degree' ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size}).",
+ )
+ return hsd
+
+
+def experts_per_rank(args: Arguments) -> int:
+ return args.moe_num_experts // expert_sharding_degree(args)
+
+
+def features_per_rank(args: Arguments) -> int:
+ return args.ffn_hidden_size // hidden_sharding_degree(args)
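+
+
+# Worked example (illustrative numbers, assuming ffn_hidden_size is even):
+# with world_size=8 and moe_num_experts=4, expert_sharding_degree() == 4,
+# hidden_sharding_degree() == 8 // 4 == 2, experts_per_rank() == 1 and
+# features_per_rank() == ffn_hidden_size // 2, i.e. each expert's FFN weights
+# are split across two ranks and summed in the parallel forward pass.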
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/router.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cb2782348d62583376f1a183c7ede83601216d
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/router.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
+
+_ROUTER_LOGITS = []
+
+
+def _save_router_logits(logits: torch.Tensor, args: Arguments):
+ if args.moe_zloss_weight == 0:
+ return
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.append(logits)
+
+
+def clear_router_zloss():
+ global _ROUTER_LOGITS
+ _ROUTER_LOGITS.clear()
+
+
+def batched_router_zloss(args: Arguments):
+ global _ROUTER_LOGITS
+
+ if args.moe_zloss_weight == 0:
+ import warnings
+ warnings.warn('Call to batched_router_zloss, but moe_zloss_weight=0')
+ return 0
+
+ logits_per_router = _ROUTER_LOGITS
+
+ if args.moe_zloss_in_fp32:
+ logits_per_router = [logits.float() for logits in logits_per_router]
+
+ unscaled_zloss_per_router = torch.stack([
+ torch.logsumexp(logits, dim=1).square().mean() for logits in logits_per_router
+ ])
+
+ return args.moe_zloss_weight * unscaled_zloss_per_router
+
+
+# NOTE: To enable end-to-end benchmarking without convergence we
+# support a flag to force the router to assign tokens uniformly
+# across the experts. We do this with a custom autograd operation
+# so that PyTorch still executes the full set of router operations.
+class _UniformExpertAssignment(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx: Any, x: torch.Tensor, num_experts: int):
+ out = torch.arange(x.numel(), dtype=x.dtype, device=x.device)
+ out = torch.remainder(out, num_experts)
+ return out.view(x.shape)
+
+
+_uniform_expert_assignment = _UniformExpertAssignment.apply
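+
+# Example (illustrative): for a flat tensor of 4 routing slots and 2 experts,
+# _uniform_expert_assignment(torch.zeros(4, dtype=torch.long), 2) returns
+# tensor([0, 1, 0, 1]): experts are assigned round-robin, ignoring the input
+# values and keeping only its shape, dtype and device.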
+
+
+class LearnedRouter(torch.nn.Module):
+
+ def __init__(self, args: Arguments):
+ super().__init__()
+ self.args = args
+
+ # Learned router parameters.
+ #
+ # NOTE: This weight matrix is not parallelized with expert model
+ # parallelism. Each device needs the entire router weight matrix
+ # so that it can route its batch of data correctly.
+ self.layer = torch.nn.Linear(
+ args.hidden_size,
+ args.moe_num_experts,
+ bias=False,
+ dtype=common.dtype(args),
+ device=args.device,
+ )
+ args.init_method(self.layer.weight)
+
+ def jitter(self, x: torch.Tensor):
+ low: float = 1.0 - self.args.moe_jitter_eps
+ high: float = 1.0 + self.args.moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return low + noise * (high - low)
+
+ def _top_k(self, scores: torch.Tensor):
+ if self.args.moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, self.args.moe_top_k, dim=-1)
+
+ def forward(self, x: torch.Tensor):
+ if self.training and self.args.moe_jitter_eps is not None:
+ x = x * self.jitter(x)
+
+ logits = self.layer(x.view(-1, x.shape[-1]))
+ _save_router_logits(logits, self.args)
+ scores = logits.softmax(dim=-1)
+ expert_weights, expert_indices = self._top_k(scores)
+ if self.args.moe_normalize_expert_weights:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=self.args.moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ expert_indices = (
+ _uniform_expert_assignment(
+ expert_indices,
+ self.args.moe_num_experts,
+ ) if self.args.uniform_expert_assignment else expert_indices
+ )
+ return scores, expert_weights, expert_indices
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/sharedexpert_registry.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/sharedexpert_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..5840862f88f370ace5fd49bd0612fc98d186cc49
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_layers/sharedexpert_registry.py
@@ -0,0 +1,32 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
+
+_REGISTRY = {
+ 'mlp': mlp.SharedMLP,
+ 'glu': glu.SharedGLU,
+}
+
+
+def get(args: Arguments) -> Union[mlp.SharedMLP, glu.SharedGLU]:
+ """Returns an SharedMLP for use in a dMoE instance.
+
+ Uses the provided arguments to instantiate the appropriate
+ SharedMLP instance.
+
+ Args:
+ args: propagated Arguments dataclass.
+
+ Returns:
+ An instantiated SharedMLP constructed using the input args.
+ """
+ if args.mlp_type not in _REGISTRY:
+ raise ValueError(f'Unsupported mlp type: {args.mlp_type}')
+
+ return _REGISTRY[args.mlp_type](args)
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_megablocks_xpu_a45325d.abi3.so b/build/torch29-cxx11-xpu20252-x86_64-linux/_megablocks_xpu_a45325d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..690eead6de2b9eba259e73b756f7a280bdd33c63
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_megablocks_xpu_a45325d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9da3abdc02eb695338490793988e2c315411f3bf732e8839af05f41eb3aec66
+size 5197008
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9be29157b8f34992dd924071221b419a35a9145f
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _megablocks_xpu_a45325d
+ops = torch.ops._megablocks_xpu_a45325d
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_megablocks_xpu_a45325d::{op_name}"
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_version.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55783177af19bc03654c730c4892df8f8532279
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_version.py
@@ -0,0 +1,6 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+"""The MegaBlocks Version."""
+
+__version__ = '0.11.0.dev0'
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/backend/__init__.py b/build/torch29-cxx11-xpu20252-x86_64-linux/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e43e9b7e6a9a1c3bde2df34914643ca5d8332
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/backend/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/backend/kernels.py b/build/torch29-cxx11-xpu20252-x86_64-linux/backend/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20139324883338ddb312e4b05a72056d74491ac
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/backend/kernels.py
@@ -0,0 +1,557 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import triton
+import triton.language as tl
+
+# Stub out triton.autotune when testing in an environment that does not have CUDA.
+# This approach preserves the original code but enables testing without a GPU.
+if torch.cuda.is_available() is False:
+ import warnings
+
+ warnings.warn("CUDA is not available. Triton autotuning is disabled.")
+
+ def _no_autotune(*args, **kwargs):
+ def deco(fn):
+ return fn
+ return deco
+
+ triton.autotune = _no_autotune
+
+
+def assert_is_tensor(x, ndim):
+ if x.ndim != ndim:
+ raise ValueError(f'Expected {ndim}-tensor but got {x.ndim}-tensor')
+
+
+def assert_is_matrix(x):
+ assert_is_tensor(x, 2)
+
+
+def assert_is_vector(x):
+ if x.ndim != 1:
+ raise ValueError(f'Expected 1-tensor but got {x.ndim}-tensor')
+
+
+def assert_equal(a, b):
+ if a != b:
+ raise ValueError(f'Expected dimensions to be equal but got {a} and {b}.',)
+
+
+# a: (tokens, hidden_size), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy(
+ a,
+ b,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Our index into array 'a'.
+ index_a = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'b'.
+ index_b = offset_in_bin
+ if bin_idx > 0:
+ index_b += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def padded_gather(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: Because of the padding, the output size is dynamic.
+ # We load the final padded bin bound to get the output rows.
+ output_rows = padded_bins[-1].cpu().item()
+ out = torch.zeros((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
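+# Shape sketch (illustrative): for x of shape [tokens, hidden], padded_gather
+# returns [padded_bins[-1], hidden] rows grouped by expert, with each expert's
+# rows padded up to its boundary in `padded_bins`; `gather` below is the
+# unpadded case and returns exactly [tokens * top_k, hidden] rows.
+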
+
+def gather(x, indices, bin_ids, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+ assert_equal(bin_ids.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ # NOTE: There is no padding so the output rows equals the
+ # input rows multiplied by top_k.
+ output_rows = x.shape[0] * top_k
+ out = torch.empty((output_rows, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ x,
+ out,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def padded_scatter(x, indices, bin_ids, weights, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens, top_k, x.shape[1]), dtype=x.dtype, device=x.device)
+ _padded_copy[(indices.shape[0],)](
+ out,
+ x,
+ indices,
+ bin_ids,
+ weights,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, x.shape[1])
+
+
+def scatter(x, indices, bin_ids, weights, bins, top_k):
+ return padded_scatter(x, indices, bin_ids, weights, bins, bins, top_k)
+
+
+# x: (tokens, top_k, hidden_size), real
+# grad: (tokens, hidden_size), real.
+# wgrad: (tokens, top_k), real.
+# indices: (tokens * top_k), integer.
+# bin_ids: (tokens * top_k), integer.
+# bins: (num_experts), integer.
+# padded_bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _padded_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Our index into 'tokens * top_k'.
+ index_out = tl.load(indices + tl.program_id(0))
+
+    # One threadblock per row in 'a'. Array 'b' has a greater or equal
+    # number of rows since it may be padded.
+ bin_idx = tl.load(bin_ids + tl.program_id(0))
+
+ # Now we know what bin we're assigned to, but we need to know how
+ # many threadblocks were assigned to earlier bins so we can offset
+ # in our bin properly.
+ offset_in_bin = tl.program_id(0)
+ if bin_idx > 0:
+ offset_in_bin -= tl.load(bins + bin_idx - 1)
+
+ # Load the starting index of our bin in array 'x'.
+ index_x = offset_in_bin
+ if bin_idx > 0:
+ index_x += tl.load(padded_bins + bin_idx - 1)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def padded_scatter_wgrad(x, grad, indices, bin_ids, bins, padded_bins, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bin_ids)
+ assert_is_vector(bins)
+ assert_is_vector(padded_bins)
+ assert_equal(indices.shape[0], bin_ids.shape[0])
+ assert_equal(bins.size(), padded_bins.size())
+
+ tokens = indices.shape[0] // top_k
+ out = torch.empty((tokens * top_k), dtype=x.dtype, device=x.device)
+ _padded_copy_wgrad[(indices.shape[0],)](
+ x,
+ grad,
+ out,
+ indices,
+ bin_ids,
+ bins,
+ padded_bins,
+ NUM_COLUMNS=x.shape[1],
+ TOP_K=top_k,
+ )
+ return out
+
+
+def scatter_wgrad(x, grad, indices, bin_ids, bins, top_k):
+ return padded_scatter_wgrad(x, grad, indices, bin_ids, bins, bins, top_k)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy(
+ a,
+ b,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+ A_TO_B: tl.constexpr,
+ SCALE: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_b = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_a = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ #
+ # If we're going from A to B, divide the input index to copy
+ # the same input repeatedly. If we're going from B to A we
+ # need to reduce the result. Using atomics is slow, so we
+ # do the reduce step in a second kernel.
+ offset = index_a // TOP_K if A_TO_B else index_a
+ a += tl.multiple_of(offset * NUM_COLUMNS, NUM_COLUMNS)
+ b += tl.multiple_of(index_b * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ # Load the scale, if requested.
+ scale = tl.load(weights + index_a) if SCALE else 1
+
+ # Swap the pointers depending on the direction.
+ #
+ # NOTE: We need to zero the output in both directions.
+ iptr = a if A_TO_B else b
+ optr = b if A_TO_B else a
+
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ x = tl.load(iptr + offsets, mask=mask)
+ x = x.to(tl.float32) * scale.to(tl.float32)
+
+ tl.store(optr + offsets, x.to(optr.dtype.element_ty), mask=mask)
+
+ offsets += BLOCK_X
+
+
+def binned_gather(x, indices, weights, bins, expert_capacity, top_k):
+ # Validate the input shapes.
+ assert_is_matrix(x)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(indices.shape[0], x.shape[0] * top_k)
+
+ if weights is not None:
+ assert_equal(weights.shape[0], x.shape[0] * top_k)
+
+ num_experts = bins.shape[0]
+ out = torch.zeros((num_experts, expert_capacity, x.shape[1]), dtype=x.dtype, device=x.device)
+
+ _binned_copy[(num_experts, expert_capacity)](
+ x,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=x.shape[1],
+ A_TO_B=True,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+ return out
+
+
+def binned_scatter(x, indices, weights, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ if weights is not None:
+ assert_equal(indices.shape[0], weights.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens, top_k, hidden_size), dtype=x.dtype, device=x.device)
+ _binned_copy[(num_experts, expert_capacity)](
+ out,
+ x,
+ num_experts,
+ expert_capacity,
+ indices,
+ weights,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ A_TO_B=False,
+ TOP_K=top_k,
+ SCALE=weights is not None,
+ )
+
+ # Reduce along the top-k dimension, if needed.
+ return out.sum(dim=1) if top_k > 1 else out.view(tokens, hidden_size)
+
+
+# a: (tokens, hidden_size), real.
+# b: (num_experts, expert_capacity, num_columns), real.
+# indices: (tokens * top_k), integer.
+# weights: (tokens * top_k), real.
+# bins: (num_experts), integer.
+@triton.autotune(
+ configs=[
+ triton.Config({'BLOCK_X': 64}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=2),
+ triton.Config({'BLOCK_X': 256}, num_warps=2),
+ triton.Config({'BLOCK_X': 128}, num_warps=4),
+ triton.Config({'BLOCK_X': 256}, num_warps=4),
+ ],
+ key=['NUM_COLUMNS'],
+)
+@triton.jit
+def _binned_copy_wgrad(
+ x,
+ grad,
+ wgrad,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS: tl.constexpr,
+ TOP_K: tl.constexpr,
+ BLOCK_X: tl.constexpr,
+):
+ # Load our indices into the output.
+ expert_idx = tl.program_id(0)
+ entry_idx = tl.program_id(1)
+
+ # Calculate our offset into the output.
+ index_x = expert_idx * expert_capacity + entry_idx
+
+ # Load the index bounds for our bin and calculate
+ # the number of tokens assigned to our expert.
+ start = 0
+ if expert_idx > 0:
+ start = tl.load(bins + expert_idx - 1)
+ end = tl.load(bins + expert_idx)
+ num_tokens = end - start
+
+ # Calculate our offset into the input. If we don't
+    # have an input, exit early.
+ if entry_idx >= num_tokens:
+ return
+ index_out = tl.load(indices + start + entry_idx)
+
+ # Offset the input and output pointers.
+ wgrad += index_out
+ grad += tl.multiple_of((index_out // TOP_K) * NUM_COLUMNS, NUM_COLUMNS)
+ x += tl.multiple_of(index_x * NUM_COLUMNS, NUM_COLUMNS)
+ offsets = tl.max_contiguous(tl.arange(0, BLOCK_X), BLOCK_X)
+
+ acc = tl.zeros((BLOCK_X,), dtype=tl.float32)
+ iterations = tl.cdiv(NUM_COLUMNS, BLOCK_X)
+ for _ in range(iterations):
+ mask = offsets < NUM_COLUMNS
+ data = tl.load(x + offsets, mask=mask).to(tl.float32)
+ scale = tl.load(grad + offsets, mask=mask).to(tl.float32)
+ acc += data * scale
+ offsets += BLOCK_X
+
+ # Reduce to get the final result and store.
+ out = tl.sum(acc).to(wgrad.dtype.element_ty)
+ tl.store(wgrad, out)
+
+
+def binned_scatter_wgrad(x, grad, indices, bins, top_k):
+ # Validate the input shapes.
+ assert_is_tensor(x, 3)
+ assert_is_matrix(grad)
+ assert_is_vector(indices)
+ assert_is_vector(bins)
+ assert_equal(bins.shape[0], x.shape[0])
+
+ num_experts, expert_capacity, hidden_size = x.shape
+ tokens = indices.shape[0] // top_k
+ out = torch.zeros((tokens * top_k), dtype=x.dtype, device=x.device)
+ _binned_copy_wgrad[(num_experts, expert_capacity)](
+ x,
+ grad,
+ out,
+ num_experts,
+ expert_capacity,
+ indices,
+ bins,
+ NUM_COLUMNS=hidden_size,
+ TOP_K=top_k,
+ )
+ return out
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/benchmark_util.py b/build/torch29-cxx11-xpu20252-x86_64-linux/benchmark_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..02612d95e3ead1175a596e2878fa34b5bf85ad6f
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/benchmark_util.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import torch
+
+
+def log_benchmark(name, arguments, time, std):
+ print('=' * 60)
+ print(f'{name} Benchmark')
+ print('Benchmark Parameters:')
+ for (key, value) in arguments.items():
+ print(f'{key} = {value}')
+ print('Results:')
+ print('mean time = {:.3f}ms, std time = {:.3f}ms'.format(time, std))
+ print('=' * 60)
+
+
+def benchmark_function(fn, iterations=100, warmup=10):
+ # Warmup iterations.
+ for _ in range(warmup):
+ fn()
+
+ times = []
+ for i in range(iterations):
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ start.record()
+ fn()
+ end.record()
+
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+ return np.mean(times), np.std(times)
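+
+
+# Example usage (illustrative; assumes CUDA tensors `a` and `b` already exist):
+#
+#   mean_ms, std_ms = benchmark_function(lambda: torch.mm(a, b))
+#   log_benchmark('MatMul', {'m': a.shape[0]}, mean_ms, std_ms)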
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/cpu_fused_moe.py b/build/torch29-cxx11-xpu20252-x86_64-linux/cpu_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a29e81d6e9c57f64b0a78ac1c0828e45fd9d855
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/cpu_fused_moe.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks CPU Fused MoE Implementation
+#
+# This is a pure Python/PyTorch implementation for CPU.
+# For better performance, consider using the C++ kernel implementation.
+#
+import torch
+import torch.nn.functional as F
+
+
+def swigluoai_activation(gate: torch.Tensor, up: torch.Tensor,
+ alpha: float = 1.702, limit: float = 7.0) -> torch.Tensor:
+ """
+ SwigluOAI activation function used in GptOss models.
+
+ Formula:
+ gate = clamp(gate, max=limit)
+ up = clamp(up, -limit, limit)
+ glu = gate * sigmoid(gate * alpha)
+ output = (up + 1) * glu
+
+ Args:
+ gate: Gate tensor from gate projection
+ up: Up tensor from up projection
+ alpha: Scaling factor for sigmoid (default: 1.702)
+ limit: Clamp limit (default: 7.0)
+
+ Returns:
+ Activated tensor
+ """
+ gate = gate.clamp(max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ return (up + 1) * glu
+
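+# Example (illustrative values): for gate=torch.tensor([2.0]) and
+# up=torch.tensor([0.5]) with the default alpha=1.702 and limit=7.0,
+# glu = 2.0 * sigmoid(3.404) ~= 1.936, so the output is (0.5 + 1) * glu ~= 2.90.
+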
+
+def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
+ """
+ SiLU (Swish) activation with element-wise multiplication.
+
+ Formula:
+ output = silu(gate) * up
+
+ Args:
+ gate: Gate tensor
+ up: Up tensor
+
+ Returns:
+ Activated tensor
+ """
+ return F.silu(gate) * up
+
+
+def route_tokens_cpu(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor | None,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_normalize_expert_weights: int | None = None,
+) -> tuple:
+ """
+ Route tokens to experts and compute expert weights and indices (CPU version).
+
+ Args:
+ x: Input tensor [batch, seq, hidden] or [tokens, hidden]
+ router_weight: Router weight [num_experts, hidden]
+ router_bias: Router bias [num_experts] or None
+ moe_top_k: Number of experts per token
+ moe_num_experts: Total number of experts
+ moe_normalize_expert_weights: Normalization order or None
+
+ Returns:
+ Tuple of (logits, expert_weights, expert_indices)
+ """
+ x_flat = x.view(-1, x.shape[-1])
+ logits = F.linear(x_flat, router_weight, router_bias)
+
+ if moe_top_k == 1:
+ expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
+ else:
+ expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
+
+ expert_weights = expert_weights.softmax(dim=-1)
+
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+
+ return logits, expert_weights, expert_indices
+
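+# Example (illustrative shapes): routing 6 tokens of width 64 to 2 of 8 experts.
+#
+#   x = torch.randn(6, 64)
+#   w = torch.randn(8, 64)
+#   logits, weights, ids = route_tokens_cpu(x, w, None, moe_top_k=2, moe_num_experts=8)
+#   # logits: [6, 8]; weights and ids: [6, 2]; weights sum to 1 over the top-k dim.
+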
+
+def cpu_fused_moe(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_bias: torch.Tensor | None = None,
+ w2_bias: torch.Tensor | None = None,
+ activation: str = "silu",
+ alpha: float = 1.702,
+ limit: float = 7.0,
+ is_interleaved: bool = True,
+) -> torch.Tensor:
+ """
+ CPU Fused MoE using PyTorch operations.
+
+    This implementation loops over experts, but processes all tokens routed to
+    each expert with batched matrix operations, which is more efficient on CPU
+    than per-token loops.
+
+ Args:
+ hidden_states: [num_tokens, hidden_size]
+ w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
+ w2: [num_experts, inter_size, hidden_size] - down_proj weights
+ topk_weights: [num_tokens, topk] - routing weights
+ topk_ids: [num_tokens, topk] - expert indices
+ w1_bias: [num_experts, 2*inter_size] or None
+ w2_bias: [num_experts, hidden_size] or None
+ activation: "silu" or "swigluoai"
+ alpha: swigluoai alpha parameter
+ limit: swigluoai limit parameter
+ is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...] (True for GptOss)
+
+ Returns:
+ output: [num_tokens, hidden_size]
+ """
+ num_tokens, hidden_size = hidden_states.shape
+ num_experts = w1.shape[0]
+ inter_size = w2.shape[1]
+ topk = topk_weights.shape[1]
+
+ # Initialize output
+ output = torch.zeros_like(hidden_states)
+
+    # For each expert, find the (token_idx, topk_pos) pairs routed to it and
+    # process those tokens as one batch.
+ for expert_idx in range(num_experts):
+ # Find tokens assigned to this expert
+ # mask shape: [num_tokens, topk], True where topk_ids == expert_idx
+ mask = (topk_ids == expert_idx)
+
+ if not mask.any():
+ continue
+
+ # Get token indices and topk positions
+ token_indices, topk_positions = torch.where(mask)
+
+ if len(token_indices) == 0:
+ continue
+
+ # Gather input tokens for this expert
+ # current_hidden: [num_selected_tokens, hidden_size]
+ current_hidden = hidden_states[token_indices]
+
+ # Get weights for this expert
+ # w1[expert_idx]: [hidden_size, 2*inter_size]
+ # w2[expert_idx]: [inter_size, hidden_size]
+ expert_w1 = w1[expert_idx] # [hidden_size, 2*inter_size]
+ expert_w2 = w2[expert_idx] # [inter_size, hidden_size]
+
+ # First projection: hidden @ w1 -> [num_selected, 2*inter_size]
+ gate_up = current_hidden @ expert_w1
+
+ # Add bias if present
+ if w1_bias is not None:
+ gate_up = gate_up + w1_bias[expert_idx]
+
+ # Split gate and up projections
+ if is_interleaved:
+ # GptOss uses interleaved layout: [g0, u0, g1, u1, ...]
+ gate = gate_up[..., ::2] # [num_selected, inter_size]
+ up = gate_up[..., 1::2] # [num_selected, inter_size]
+ else:
+ # Standard layout: [gate_all, up_all]
+ gate = gate_up[..., :inter_size]
+ up = gate_up[..., inter_size:]
+
+ # Apply activation
+ if activation == "swigluoai":
+ activated = swigluoai_activation(gate, up, alpha, limit)
+ else: # silu
+ activated = silu_and_mul_activation(gate, up)
+
+ # Second projection: activated @ w2 -> [num_selected, hidden_size]
+ expert_out = activated @ expert_w2
+
+ # Add bias if present
+ if w2_bias is not None:
+ expert_out = expert_out + w2_bias[expert_idx]
+
+ # Apply routing weights and accumulate
+ # weights shape: [num_selected]
+ weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
+ weighted_out = expert_out * weights
+
+ # Accumulate to output
+ output.index_add_(0, token_indices, weighted_out.to(output.dtype))
+
+ return output
+
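+# Example (illustrative shapes, standard non-interleaved SwiGLU layout):
+#
+#   x = torch.randn(4, 32)                     # 4 tokens, hidden_size 32
+#   w1 = torch.randn(8, 32, 2 * 16)            # 8 experts, inter_size 16
+#   w2 = torch.randn(8, 16, 32)
+#   _, tw, ti = route_tokens_cpu(x, torch.randn(8, 32), None, 2, 8)
+#   out = cpu_fused_moe(x, w1, w2, tw, ti, activation="silu", is_interleaved=False)
+#   # out: [4, 32]
+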
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ """
+ CPU MoE MLP module that can be used as a drop-in replacement for
+ the transformers GptOssMLP when using @use_kernel_forward_from_hub.
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer.
+
+ Args:
+ x: Input tensor of shape [batch_size, seq_len, hidden_size] or [tokens, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights) where:
+ - output: Tensor of same shape as input
+ - expert_weights: Expert weights for each token [tokens, top_k]
+ """
+ # Get MoE parameters from the wrapped modules
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ is_interleaved = True # GptOss uses interleaved layout
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ is_interleaved = False
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Reshape input for fused MoE
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Call CPU fused MoE
+ output = cpu_fused_moe(
+ hidden_states=x_flat,
+ w1=w1,
+ w2=w2,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ activation=activation,
+ alpha=alpha,
+ limit=limit,
+ is_interleaved=is_interleaved,
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+# Export classes and functions
+__all__ = [
+ "MegaBlocksMoeMLP",
+ "cpu_fused_moe",
+ "route_tokens_cpu",
+ "swigluoai_activation",
+ "silu_and_mul_activation",
+]
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/cpu_moe_cpp.py b/build/torch29-cxx11-xpu20252-x86_64-linux/cpu_moe_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..073ff66d24ce348fbb5ed19c9027fadd3f7a9c61
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/cpu_moe_cpp.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# MegaBlocks C++ Optimized CPU MoE
+
+"""
+C++ accelerated MoE with brgemm optimization for Intel AMX.
+Direct replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+"""
+
+import torch
+from typing import Optional
+from .cpu_fused_moe import route_tokens_cpu
+from ._ops import ops
+
+
+def _to_local_tensor(tensor: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+ """Convert DTensor to local torch.Tensor if needed for custom ops compatibility."""
+ if tensor is None:
+ return None
+ # Check if it's a DTensor by looking for the to_local() method
+ if hasattr(tensor, "to_local"):
+ return tensor.to_local()
+ return tensor
+
+
+def fused_moe_cpp(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ inplace: bool = False,
+ use_int8_w8a8: bool = False,
+ use_fp8_w8a16: bool = False,
+ use_mxfp4: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ block_size: Optional[list] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ w1_bias: Optional[torch.Tensor] = None,
+ w2_bias: Optional[torch.Tensor] = None,
+ alpha: Optional[float] = None,
+ limit: Optional[float] = None,
+ is_vnni: bool = False,
+) -> torch.Tensor:
+ """
+ C++ Fused MoE with brgemm optimization (sglang compatible interface).
+
+ Uses at::native::cpublas::brgemm for efficient batch GEMM on Intel CPUs.
+ Supports both silu_and_mul (standard SwiGLU) and swigluoai (GptOss) activations.
+
+ Args:
+ hidden_states: Input tensor [M, K]
+ w1: Gate and up projections [E, 2N, K]
+ w2: Down projection [E, K, N]
+ topk_weights: Expert weights [M, topk]
+ topk_ids: Expert indices [M, topk]
+ inplace: Whether to use hidden_states as output
+ use_int8_w8a8: Use int8 quantization
+ use_fp8_w8a16: Use fp8 quantization
+ use_mxfp4: Use mxfp4 quantization
+ w1_scale, w2_scale: Quantization scales
+ block_size: Block size for fp8
+ a1_scale, a2_scale: Activation scales
+ w1_bias, w2_bias: Optional biases
+ alpha: swigluoai alpha parameter (set to enable swiglu)
+ limit: swigluoai limit parameter (set to enable swiglu)
+ is_vnni: Whether w1/w2 are already in VNNI packed format
+ """
+ # MXFP4/FP8 kernels only support bf16, convert if needed
+ orig_dtype = hidden_states.dtype
+ need_convert = ((use_mxfp4 or use_fp8_w8a16) and orig_dtype != torch.bfloat16) or orig_dtype == torch.float32
+ if need_convert:
+ hidden_states = hidden_states.to(torch.bfloat16)
+
+ # bias must match hidden_states dtype
+ if w1_bias is not None:
+ w1_bias = w1_bias.to(hidden_states.dtype)
+ if w2_bias is not None:
+ w2_bias = w2_bias.to(hidden_states.dtype)
+
+ # Convert DTensor to local tensor for custom ops compatibility (TP mode)
+ hidden_states = _to_local_tensor(hidden_states)
+ w1 = _to_local_tensor(w1)
+ w2 = _to_local_tensor(w2)
+ topk_weights = _to_local_tensor(topk_weights)
+ topk_ids = _to_local_tensor(topk_ids)
+ w1_scale = _to_local_tensor(w1_scale)
+ w2_scale = _to_local_tensor(w2_scale)
+ a1_scale = _to_local_tensor(a1_scale)
+ a2_scale = _to_local_tensor(a2_scale)
+ w1_bias = _to_local_tensor(w1_bias)
+ w2_bias = _to_local_tensor(w2_bias)
+
+ output = ops.fused_experts(
+ hidden_states, w1, w2, topk_weights, topk_ids,
+ inplace, use_int8_w8a8, use_fp8_w8a16, use_mxfp4,
+ w1_scale, w2_scale, block_size, a1_scale, a2_scale,
+ w1_bias, w2_bias, alpha, limit, is_vnni
+ )
+
+ # Convert back to original dtype if needed
+ if need_convert:
+ output = output.to(orig_dtype)
+ return output
+
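+# Example call (illustrative; the tensor names are placeholders and the weights
+# are assumed to be pre-packed with ops.convert_weight_packed):
+#
+#   out = fused_moe_cpp(x_flat, w1_packed, w2_packed, topk_weights,
+#                       topk_ids.to(torch.int32), w1_bias=b1, w2_bias=b2,
+#                       alpha=1.702, limit=7.0, is_vnni=True)
+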
+
+class CPUMegaBlocksMoeMLP(torch.nn.Module):
+ """
+ C++ optimized MoE MLP using brgemm.
+ Drop-in replacement for cpu_fused_moe.MegaBlocksMoeMLP with better performance.
+
+ Usage in transformers:
+ # Will be used via @use_kernel_forward_from_hub decorator
+ """
+ can_torch_compile: bool = True
+
+ def forward(self, x: torch.Tensor) -> tuple:
+ """
+ Forward pass through the MoE layer using C++ kernel.
+
+ Args:
+ x: Input tensor [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tuple of (output, expert_weights)
+ """
+ # Optimization for GPT-OSS model
+ if getattr(self, "use_mxfp4", None) is None:
+ self.use_mxfp4 = False
+
+ w1_scale = None
+ w2_scale = None
+
+ if (
+ not getattr(self, "packed_scales", False)
+ and hasattr(self.experts, "gate_up_proj")
+ and getattr(self.experts, "gate_up_proj_precision_config", None) is not None
+ ):
+ # convert scales
+ data_1 = ops.convert_scale_packed(self.experts.gate_up_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ data_2 = ops.convert_scale_packed(self.experts.down_proj_precision_config.weight_scale.data.transpose(-1, -2).contiguous())
+ self.experts.gate_up_proj_precision_config.weight_scale.storage.data = data_1
+ self.experts.down_proj_precision_config.weight_scale.storage.data = data_2
+ self.packed_scales = True
+ self.use_mxfp4 = True
+
+ if not getattr(self, "packed_weight", False) and hasattr(
+ self.experts, "gate_up_proj"
+ ):
+ # convert weights
+ data_1 = self.experts.gate_up_proj.data.transpose(-1, -2).contiguous()
+ data_2 = self.experts.down_proj.data.transpose(-1, -2).contiguous()
+ if self.use_mxfp4:
+ self.experts.gate_up_proj.storage.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.storage.data = ops.convert_weight_packed(data_2)
+ else:
+ # convert_weight_packed only supports bfloat16, float16, int8, fp8_e4m3 or uint8(mxfp4 or int4).
+ data_1 = data_1.to(torch.bfloat16) if data_1.dtype == torch.float32 else data_1
+ data_2 = data_2.to(torch.bfloat16) if data_2.dtype == torch.float32 else data_2
+ self.experts.gate_up_proj.data = ops.convert_weight_packed(data_1)
+ self.experts.down_proj.data = ops.convert_weight_packed(data_2)
+
+ # C++ kernel does not support float32.
+ dtype = torch.bfloat16 if x.dtype == torch.float32 else x.dtype
+ if getattr(self.experts, "gate_up_proj_bias", None) is not None:
+ self.experts.gate_up_proj_bias.data = self.experts.gate_up_proj_bias.data.to(dtype)
+ if getattr(self.experts, "down_proj_bias", None) is not None:
+ self.experts.down_proj_bias.data = self.experts.down_proj_bias.data.to(dtype)
+
+ self.packed_weight = True
+
+ # Get MoE parameters
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ moe_normalize_expert_weights = getattr(self.experts, "normalize_expert_weights", None)
+
+ # Detect activation type
+ if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
+ activation = "swigluoai"
+ alpha = self.experts.alpha
+ limit = self.experts.limit
+ else:
+ activation = getattr(self.experts, "activation", "silu")
+ alpha = 1.702
+ limit = 7.0
+
+ # Get weight tensors
+ if hasattr(self.experts, "gate_up_proj"):
+ w1 = self.experts.gate_up_proj
+ elif hasattr(self.experts, "w1"):
+ w1 = self.experts.w1
+ w3 = getattr(self.experts, "w3", None)
+ if w3 is not None:
+ w1 = torch.cat([w1, w3], dim=-1)
+ else:
+ raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")
+
+ if hasattr(self.experts, "down_proj"):
+ w2 = self.experts.down_proj
+ elif hasattr(self.experts, "w2"):
+ w2 = self.experts.w2
+ else:
+ raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")
+
+ # Get optional bias tensors
+ w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
+ w2_bias = getattr(self.experts, "down_proj_bias", None)
+ w1_bias = w1_bias if w1_bias is None else w1_bias.data
+ w2_bias = w2_bias if w2_bias is None else w2_bias.data
+
+ if self.use_mxfp4:
+ w1_scale = self.experts.gate_up_proj_precision_config.weight_scale.data
+ w2_scale = self.experts.down_proj_precision_config.weight_scale.data
+
+ # Store original shape
+ in_shape = x.size()
+
+ # Route tokens to experts (Python implementation is fast enough)
+ logits, expert_weights, expert_indices = route_tokens_cpu(
+ x,
+ self.router.weight,
+ getattr(self.router, "bias", None),
+ moe_top_k,
+ moe_num_experts,
+ moe_normalize_expert_weights,
+ )
+
+ # Flatten input
+ x_flat = x.view(-1, x.shape[-1])
+
+ # Determine alpha/limit for swiglu activation
+ use_alpha = alpha if activation == "swigluoai" else None
+ use_limit = limit if activation == "swigluoai" else None
+
+ # Call C++ optimized kernel
+ output = fused_moe_cpp(
+ hidden_states=x_flat,
+ w1=w1.data,
+ w2=w2.data,
+ topk_weights=expert_weights,
+ topk_ids=expert_indices.to(torch.int32),
+ inplace=False,
+ use_int8_w8a8=False,
+ use_fp8_w8a16=False,
+ use_mxfp4=self.use_mxfp4,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ block_size=None,
+ a1_scale=None,
+ a2_scale=None,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ alpha=use_alpha,
+ limit=use_limit,
+ is_vnni=getattr(self, "packed_weight", False),
+ )
+
+ # Restore original shape
+ output = output.view(in_shape)
+
+ return output, expert_weights
+
+
+__all__ = ["fused_moe_cpp", "CPUMegaBlocksMoeMLP"]
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/grouped_gemm/__init__.py b/build/torch29-cxx11-xpu20252-x86_64-linux/grouped_gemm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91c8308f0c24f4c4171b6e4f15b6f76dabf295a
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/grouped_gemm/__init__.py
@@ -0,0 +1,2 @@
+from . import ops
+from . import backend
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/grouped_gemm/backend.py b/build/torch29-cxx11-xpu20252-x86_64-linux/grouped_gemm/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..76037d8039cbfc2f0577275c78e4bc0be762592a
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/grouped_gemm/backend.py
@@ -0,0 +1,33 @@
+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+# from megablocks._ops import ops as backend # type: ignore
+from .._ops import ops as backend # type: ignore
+
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+ assert not (trans_a and trans_b)
+ assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+ assert a.ndim == 2, "Expected 2d tensor for 'a'"
+ assert b.ndim == (2 if trans_a else 3)
+
+ shape = (
+ (batch_sizes.shape[0], a.shape[1], b.shape[1])
+ if trans_a else
+ (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+ )
+ return torch.empty(*shape, device=a.device, dtype=a.dtype)
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+ if c is None:
+ c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+ backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+ return c
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/grouped_gemm/ops.py b/build/torch29-cxx11-xpu20252-x86_64-linux/grouped_gemm/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b30dd14e23837ea3b12334f4e31337ed9ad2b69
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/grouped_gemm/ops.py
@@ -0,0 +1,33 @@
+from . import backend
+import torch
+
+
+class GroupedGemm(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, a, b, batch_sizes, trans_b):
+ ctx.save_for_backward(a, b, batch_sizes)
+ ctx.trans_b = trans_b
+ return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+
+ @staticmethod
+ def backward(ctx, grad):
+ grad = grad.contiguous()
+ a, b, batch_sizes = ctx.saved_tensors
+ trans_b = ctx.trans_b
+
+ agrad = None
+ if ctx.needs_input_grad[0]:
+ agrad = backend.gmm(
+ grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+
+ bgrad = None
+ if ctx.needs_input_grad[1]:
+ lhs, rhs = (grad, a) if trans_b else (a, grad)
+ bgrad = backend.gmm(
+ lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+ return agrad, bgrad, None, None
+
+
+def gmm(a, b, batch_sizes, trans_b=False):
+ return GroupedGemm.apply(a, b, batch_sizes, trans_b)
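+
+
+# Example (illustrative shapes; device and dtype support depend on the backend op):
+# three groups of 2, 3 and 1 rows share one grouped GEMM, with one weight
+# matrix per group stacked along the first dimension.
+#
+#   a = torch.randn(6, 16, dtype=torch.bfloat16, device=device)
+#   b = torch.randn(3, 16, 8, dtype=torch.bfloat16, device=device)
+#   batch_sizes = torch.tensor([2, 3, 1])
+#   c = gmm(a, b, batch_sizes)  # c: [6, 8]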
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/grouped_gemm_util.py b/build/torch29-cxx11-xpu20252-x86_64-linux/grouped_gemm_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d6d49fc46b0a57ad46e4179df3cc1ac2a24f7ae
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/grouped_gemm_util.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+
+_grouped_gemm_is_available: bool = False
+try:
+    # The upstream `import grouped_gemm` is skipped here because grouped_gemm
+    # is vendored into this package.
+    _grouped_gemm_is_available = True
+except ImportError:
+ warnings.warn('Grouped GEMM not available.')
+
+
+def grouped_gemm_is_available():
+ return _grouped_gemm_is_available
+
+
+def assert_grouped_gemm_is_available():
+ msg = (
+ 'Grouped GEMM not available. Please run '
+        '`pip install git+https://github.com/tgale96/grouped_gemm@main`.'
+ )
+ assert _grouped_gemm_is_available, msg
+
+
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+
+
+from .grouped_gemm import backend
+from .grouped_gemm import ops
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/layers.py b/build/torch29-cxx11-xpu20252-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c91edcd9e2d1a4ef9eac90217ff481f08ab1886
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/layers.py
@@ -0,0 +1,1232 @@
+import torch
+import torch.distributed as dist
+
+from typing import Optional, Any, TYPE_CHECKING
+
+from . import _layers
+from . import ops
+
+# Conditional import for meta kernel registration
+if TYPE_CHECKING:
+
+ def register_fake(fn):
+ return lambda name: fn
+
+else:
+ try:
+ from torch.library import register_fake
+ except ImportError:
+ try:
+ from torch.library import impl_abstract as register_fake
+ except ImportError:
+ # Fallback for older PyTorch versions
+ def register_fake(op_name):
+ def decorator(fn):
+ return fn
+
+ return decorator
+
+
+# Meta kernel implementations for torch.compile compatibility
+def _install_meta_kernels():
+ """Install meta kernels for existing MegaBlocks operations"""
+
+ # Create wrapper functions that check for compilation and return meta tensors
+
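+    # Pattern used by every patch below: keep a reference to the original op,
+    # and at call time return empty tensors with the expected shape/dtype/device
+    # while torch.compile is tracing (torch.compiler.is_compiling()), otherwise
+    # dispatch to the real kernel.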
+ # Patch ops.sort
+ if hasattr(ops, "sort"):
+ original_sort = ops.sort
+
+ def sort_with_meta(x, end_bit=None):
+ if torch.compiler.is_compiling():
+ # print("Using meta kernel for sort")
+ # Meta implementation - return tensors with correct shape/dtype/device
+ return torch.empty_like(x), torch.empty_like(x)
+ # print("Using original sort kernel")
+ return original_sort(x, end_bit)
+
+ ops.sort = sort_with_meta
+
+ # Patch ops.histogram
+ if hasattr(ops, "histogram"):
+ original_histogram = ops.histogram
+
+ def histogram_with_meta(x, max_val):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty((max_val,), dtype=torch.int32, device=x.device)
+ return original_histogram(x, max_val)
+
+ ops.histogram = histogram_with_meta
+
+ # Patch ops.inclusive_cumsum
+ if hasattr(ops, "inclusive_cumsum"):
+ original_inclusive_cumsum = ops.inclusive_cumsum
+
+ def inclusive_cumsum_with_meta(x, dim):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty_like(x)
+ return original_inclusive_cumsum(x, dim)
+
+ ops.inclusive_cumsum = inclusive_cumsum_with_meta
+
+ # Patch ops.binned_gather
+ if hasattr(ops, "binned_gather"):
+ original_binned_gather = ops.binned_gather
+
+ def binned_gather_with_meta(x, indices, bins, bin_size, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - output shape based on bin_size
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (bin_size, x.size(1), hidden_size),
+ dtype=x.dtype,
+ device=x.device,
+ )
+ else:
+ return torch.empty((bin_size,), dtype=x.dtype, device=x.device)
+ return original_binned_gather(x, indices, bins, bin_size, top_k)
+
+ ops.binned_gather = binned_gather_with_meta
+
+ # Patch ops.binned_scatter
+ if hasattr(ops, "binned_scatter"):
+ original_binned_scatter = ops.binned_scatter
+
+ def binned_scatter_with_meta(x, indices, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - typically reduces to 2D
+ if x.dim() >= 3:
+ return torch.empty(
+ (x.size(1), x.size(2)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty_like(x)
+ return original_binned_scatter(x, indices, weights, bins, top_k)
+
+ ops.binned_scatter = binned_scatter_with_meta
+
+ # Patch ops.gather
+ if hasattr(ops, "gather"):
+ original_gather = ops.gather
+
+ def gather_with_meta(x, indices, bin_ids, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if x.dim() >= 2:
+ hidden_size = x.size(-1)
+ return torch.empty(
+ (indices.numel(), hidden_size), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty(indices.shape, dtype=x.dtype, device=x.device)
+ return original_gather(x, indices, bin_ids, bins, top_k)
+
+ ops.gather = gather_with_meta
+
+ # Patch ops.scatter
+ if hasattr(ops, "scatter"):
+ original_scatter = ops.scatter
+
+ def scatter_with_meta(x, indices, bin_ids, weights, bins, top_k):
+ if torch.compiler.is_compiling():
+ # Meta implementation - restore sequence shape
+ seq_len = (
+ indices.size(0) // top_k
+ if indices.numel() > 0 and top_k > 0
+ else x.size(0)
+ )
+ if x.dim() >= 2:
+ return torch.empty(
+ (seq_len, x.size(-1)), dtype=x.dtype, device=x.device
+ )
+ else:
+ return torch.empty((seq_len,), dtype=x.dtype, device=x.device)
+ return original_scatter(x, indices, bin_ids, weights, bins, top_k)
+
+ ops.scatter = scatter_with_meta
+
+ # Patch ops.replicate
+ if hasattr(ops, "replicate"):
+ original_replicate = ops.replicate
+
+ def replicate_with_meta(x, bins, num_outputs):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ return torch.empty(
+ (x.shape[0], num_outputs), dtype=x.dtype, device=x.device
+ )
+ return original_replicate(x, bins, num_outputs)
+
+ ops.replicate = replicate_with_meta
+
+ # Patch ops.repeat (if it's a regular function)
+ if hasattr(ops, "repeat"):
+ original_repeat = ops.repeat
+
+ def repeat_with_meta(x, repeats):
+ if torch.compiler.is_compiling():
+ # Meta implementation
+ if isinstance(repeats, (tuple, list)):
+ new_shape = list(x.shape)
+ for i, rep in enumerate(repeats):
+ if i < len(new_shape):
+ new_shape[i] *= rep
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ else:
+ new_shape = [x.size(0) * repeats] + list(x.shape[1:])
+ return torch.empty(new_shape, dtype=x.dtype, device=x.device)
+ return original_repeat(x, repeats)
+
+ ops.repeat = repeat_with_meta
+
+
+# Install meta kernels on import
+try:
+ _install_meta_kernels()
+except Exception as e:
+ # If meta kernel installation fails, continue without them
+ # torch.compile may not work but the library will still function
+ import warnings
+
+ warnings.warn(
+ f"Failed to install meta kernels for torch.compile support: {e}", UserWarning
+ )
+
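+# With the meta kernels patched in, torch.compile can trace through the routing
+# ops: while compiling, each patched op returns an empty tensor of the expected
+# shape instead of launching the real kernel. Hedged usage sketch, assuming
+# `mlp` is an already-configured MegaBlocksMoeMLP on a supported device:
+#
+#     compiled_mlp = torch.compile(mlp)
+#     output, expert_weights = compiled_mlp(hidden_states)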
+
+# Set the expert model parallel attributes on a tensor
+def set_expert_model_parallel_attributes(
+ tensor: torch.Tensor,
+ is_parallel: bool,
+):
+ assert not hasattr(tensor, "expert_model_parallel")
+ setattr(tensor, "expert_model_parallel", is_parallel)
+
+
+# Get the expert model parallel attributes from a tensor
+def expert_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+) -> int:
+ esd = min(world_size, moe_num_experts)
+ if (moe_num_experts % esd) != 0:
+ raise ValueError(f"Cannot shard {moe_num_experts} experts {esd} ways.")
+ return esd
+
+
+# Calculate the hidden sharding degree based on world size and expert sharding degree
+def hidden_sharding_degree(
+ world_size: int,
+ moe_num_experts: int,
+ ffn_hidden_size: int,
+) -> int:
+ esd = expert_sharding_degree(world_size, moe_num_experts)
+ hsd = world_size // esd
+ if (ffn_hidden_size % hsd) != 0:
+ raise ValueError(f"Cannot shard {ffn_hidden_size} features {hsd} ways.")
+ if (esd * hsd) != world_size:
+ raise ValueError(
+ f"Invalid sharding. expert_sharding_degree ({esd}) * hidden_sharding_degree ({hsd}) != world_size ({world_size})."
+ )
+ return hsd
+
+
+# Calculate the number of experts per rank based on world size and expert sharding degree
+def experts_per_rank(
+ moe_num_experts: int,
+ world_size: int,
+) -> int:
+ return moe_num_experts // expert_sharding_degree(world_size, moe_num_experts)
+
+
+# Calculate the number of features per rank based on ffn hidden size and hidden sharding degree
+def features_per_rank(
+ ffn_hidden_size: int, world_size: int, moe_num_experts: int
+) -> int:
+ return ffn_hidden_size // hidden_sharding_degree(
+ world_size, moe_num_experts, ffn_hidden_size
+ )
+
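+
+# Illustrative sketch (not executed at import): a worked example of the
+# sharding arithmetic above. With world_size=8, moe_num_experts=4 and
+# ffn_hidden_size=3072, the experts are sharded 4 ways, the hidden dimension
+# 2 ways, and each rank holds one expert with 1536 of its features.
+def _example_sharding_arithmetic():
+    world_size, num_experts, ffn = 8, 4, 3072
+    assert expert_sharding_degree(world_size, num_experts) == 4
+    assert hidden_sharding_degree(world_size, num_experts, ffn) == 2
+    assert experts_per_rank(num_experts, world_size) == 1
+    assert features_per_rank(ffn, world_size, num_experts) == 1536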
+
+# Apply jitter to the input tensor
+def apply_jitter(x: torch.Tensor, moe_jitter_eps: float) -> torch.Tensor:
+ low = 1.0 - moe_jitter_eps
+ high = 1.0 + moe_jitter_eps
+ noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
+ return x * (low + noise * (high - low))
+
+
+# Compute the top-k scores from the logits
+def compute_top_k(scores: torch.Tensor, moe_top_k: int):
+ if moe_top_k == 1:
+ return scores.max(dim=-1, keepdim=True)
+ return torch.topk(scores, moe_top_k, dim=-1)
+
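+
+# Illustrative sketch (not executed at import): jitter rescales each activation
+# by a uniform factor in [1 - eps, 1 + eps], and compute_top_k returns the k
+# largest router logits per token together with their expert indices.
+def _example_jitter_and_top_k():
+    x = torch.randn(2, 4)
+    jittered = apply_jitter(x, moe_jitter_eps=0.01)
+    assert jittered.shape == x.shape
+    scores = torch.tensor([[0.1, 0.7, 0.2, 0.9]])
+    values, indices = compute_top_k(scores, moe_top_k=2)
+    # Experts 3 (0.9) and 1 (0.7) are selected for the single token.
+    assert indices.tolist() == [[3, 1]]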
+
+# Route tokens to experts and compute expert weights and indices
+def route_tokens(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: torch.Tensor,
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ if training and moe_jitter_eps is not None:
+ x = apply_jitter(x, moe_jitter_eps)
+
+ x_flat = x.view(-1, x.shape[-1])
+ logits = torch.nn.functional.linear(x_flat, router_weight, router_bias)
+ expert_weights, expert_indices = compute_top_k(logits, moe_top_k)
+ expert_weights = expert_weights.softmax(dim=-1)
+ if moe_normalize_expert_weights is not None:
+ expert_weights = expert_weights / torch.norm(
+ expert_weights,
+ p=moe_normalize_expert_weights,
+ dim=-1,
+ keepdim=True,
+ )
+ if uniform_expert_assignment:
+ expert_indices = _layers.router._uniform_expert_assignment(
+ expert_indices,
+ moe_num_experts,
+ )
+
+ return logits, expert_weights, expert_indices
+
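+
+# Illustrative sketch (not executed at import; shapes are hypothetical):
+# routing a [batch, seq, hidden] activation through a randomly initialized
+# router. Logits have one row per token and one column per expert, while
+# expert_weights / expert_indices are [num_tokens, moe_top_k].
+def _example_route_tokens():
+    hidden, num_experts, top_k = 16, 8, 2
+    x = torch.randn(2, 3, hidden)
+    router_weight = torch.randn(num_experts, hidden)
+    logits, weights, indices = route_tokens(
+        x, router_weight, router_bias=None,
+        moe_top_k=top_k, moe_num_experts=num_experts,
+    )
+    assert logits.shape == (6, num_experts)
+    assert weights.shape == (6, top_k) and indices.shape == (6, top_k)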
+
+# Scale the gradient of the weights
+def scale_grad(
+ w: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ if gradient_scale is None:
+ return w
+ return _layers.mlp.scale_gradient(w, gradient_scale)
+
+
+# Forward pass for the MLP layer
+def mlp_forward(
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ limit: float = 7.0,
+):
+ # Scale weights
+ w1 = scale_grad(w1, gradient_scale)
+ w2 = scale_grad(w2, gradient_scale)
+ w1_bias = scale_grad(w1_bias, gradient_scale)
+ w2_bias = scale_grad(w2_bias, gradient_scale)
+
+ # Resolve dtensors
+ w1 = _layers.mlp.resolve_dtensor(w1)
+ w2 = _layers.mlp.resolve_dtensor(w2)
+ w1_bias = _layers.mlp.resolve_dtensor(w1_bias)
+ w2_bias = _layers.mlp.resolve_dtensor(w2_bias)
+
+ # Forward pass
+ gate_up = torch.bmm(x, w1) + w1_bias[..., None, :]
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+ gate = gate.clamp(min=None, max=limit)
+ up = up.clamp(min=-limit, max=limit)
+ glu = gate * torch.sigmoid(gate * alpha)
+ next_states = torch.bmm(((up + 1) * glu), w2)
+ next_states += w2_bias[..., None, :]
+ return next_states
+
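+
+# Note on the layout above: w1 fuses the gate and up projections, with gate
+# values in the even output channels and up values in the odd ones, and the
+# activation is a clamped, sigmoid-gated unit (glu = gate * sigmoid(alpha *
+# gate); output = (up + 1) * glu). A minimal standalone sketch of just that
+# activation, with hypothetical shapes:
+def _example_gated_activation(alpha: float = 1.702, limit: float = 7.0):
+    gate_up = torch.randn(2, 4, 8)           # [experts, tokens, 2 * ffn]
+    gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+    gate = gate.clamp(max=limit)
+    up = up.clamp(min=-limit, max=limit)
+    glu = gate * torch.sigmoid(gate * alpha)
+    return (up + 1) * glu                    # [experts, tokens, ffn]
+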
+# Shared expert MLP forward pass
+def shared_mlp_forward(
+ x: torch.Tensor,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ activation_fn: Optional[Any] = None,
+ gradient_scale: Optional[float] = None,
+) -> torch.Tensor:
+ # Default activation function
+ if activation_fn is None:
+ activation_fn = torch.nn.functional.gelu
+
+ # Scale weights
+ up_proj_weight = scale_grad(up_proj_weight, gradient_scale)
+ down_proj_weight = scale_grad(down_proj_weight, gradient_scale)
+ if up_proj_bias is not None:
+ up_proj_bias = scale_grad(up_proj_bias, gradient_scale)
+ if down_proj_bias is not None:
+ down_proj_bias = scale_grad(down_proj_bias, gradient_scale)
+
+ # Resolve dtensors
+ up_proj_weight = _layers.mlp.resolve_dtensor(up_proj_weight)
+ down_proj_weight = _layers.mlp.resolve_dtensor(down_proj_weight)
+ if up_proj_bias is not None:
+ up_proj_bias = _layers.mlp.resolve_dtensor(up_proj_bias)
+ if down_proj_bias is not None:
+ down_proj_bias = _layers.mlp.resolve_dtensor(down_proj_bias)
+
+ # Up projection
+ x = torch.nn.functional.linear(x, up_proj_weight, up_proj_bias)
+
+ # Activation
+ x = activation_fn(x)
+
+ # Down projection
+ x = torch.nn.functional.linear(x, down_proj_weight, down_proj_bias)
+
+ return x
+
+
+# Combine outputs from shared expert and regular experts
+def combine_expert_shared_outputs(
+ shared_expert_out: torch.Tensor,
+ expert_out: torch.Tensor,
+ shared_expert_weighted_sum: bool = False,
+ moe_top_k: int = 1,
+) -> torch.Tensor:
+ if shared_expert_weighted_sum:
+ # Weighted sum based on number of experts used
+ total_experts = moe_top_k + 1
+ shared_weight = 1.0 / total_experts
+ expert_weight = moe_top_k / total_experts
+ return shared_expert_out * shared_weight + expert_out * expert_weight
+ else:
+ # Simple addition
+ return shared_expert_out + expert_out
+
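+
+# Illustrative sketch (not executed at import): with moe_top_k=3 the weighted
+# sum mode blends the shared expert at weight 1/4 and the routed experts at
+# weight 3/4; the default mode simply adds the two outputs.
+def _example_combine_outputs():
+    shared = torch.ones(2, 4)
+    routed = torch.zeros(2, 4)
+    weighted = combine_expert_shared_outputs(
+        shared, routed, shared_expert_weighted_sum=True, moe_top_k=3
+    )
+    assert torch.allclose(weighted, torch.full((2, 4), 0.25))
+    summed = combine_expert_shared_outputs(shared, routed)
+    assert torch.allclose(summed, torch.ones(2, 4))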
+
+# Global variable to store load balancing loss
+_LOAD_BALANCING_LOSS = []
+
+
+def save_load_balancing_loss(loss):
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.append(loss)
+
+
+def get_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ return _LOAD_BALANCING_LOSS
+
+
+def clear_load_balancing_loss():
+ global _LOAD_BALANCING_LOSS
+ _LOAD_BALANCING_LOSS.clear()
+
+
+def batched_load_balancing_loss(args):
+ if args.moe_loss_weight == 0:
+ return 0.0
+
+ tokens_per_expert, expert_scores = zip(*get_load_balancing_loss())
+ num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size
+ if args.num_layers_per_virtual_pipeline_stage is not None:
+ num_layers_per_pipeline_stage = args.num_layers_per_virtual_pipeline_stage
+
+ if len(tokens_per_expert) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} token_per_experts "
+ f"but found {len(tokens_per_expert)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+ if len(expert_scores) != num_layers_per_pipeline_stage:
+ raise ValueError(
+ f"Expected {num_layers_per_pipeline_stage} expert_scores "
+            f"but found {len(expert_scores)}.\nnum_layers = "
+ f"{args.num_layers}\npipeline_model_parallel_size = "
+ f"{args.pipeline_model_parallel_size}\n"
+ "num_layers_per_virtual_pipeline_stage"
+ f" = {args.num_layers_per_virtual_pipeline_stage}",
+ )
+
+ # Verify the shape of the tokens_per_expert and expert_scores tensors.
+ assert all(
+ (x.ndim == 1 and x.numel() == args.moe_num_experts for x in tokens_per_expert)
+ )
+
+ tokens = expert_scores[0].shape[0]
+ assert all(
+ (
+ (
+ x.ndim == 2
+ and x.shape[1] == args.moe_num_experts
+ and x.shape[0] == tokens
+ )
+ for x in expert_scores
+ )
+ )
+
+ # Concatenate the contributions of each layer and convert to
+ # the correct types and formats for the dot product.
+ expert_scores = torch.cat(expert_scores, dim=1)
+ if args.moe_lbl_in_fp32:
+ expert_scores = expert_scores.float()
+ if tokens != 0:
+ expert_scores = expert_scores.mean(dim=0)
+ else:
+ expert_scores = expert_scores.sum(dim=0)
+ tokens_per_expert = torch.cat(tokens_per_expert).to(expert_scores.dtype)
+
+ expected_values = num_layers_per_pipeline_stage * args.moe_num_experts
+ assert tokens_per_expert.numel() == expected_values
+ assert expert_scores.numel() == expected_values
+
+ # Calculate the total scale across all factors.
+ #
+ # loss_weight * num_experts / (num_layers * tokens * top_k)
+ scale_numerator = args.moe_num_experts * args.moe_loss_weight
+ scale_denominator = args.num_layers * tokens * args.moe_top_k
+ scale = scale_numerator / scale_denominator
+ return scale * torch.dot(tokens_per_expert, expert_scores)
+
+
+# Calculate the expert capacity based on tokens, top_k, number of experts,
+# expert parallel group, capacity factor, and whether expert model parallelism is used.
+def expert_capacity(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup],
+ moe_capacity_factor: float,
+ moe_expert_model_parallelism: bool,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
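+
+# Illustrative sketch (not executed at import): without expert parallelism and
+# a capacity factor of 1.25, 1024 tokens routed top-4 across 128 experts give
+# 4 * 1024 / 128 = 32 tokens per expert, i.e. a capacity of int(1.25 * 32) = 40.
+def _example_expert_capacity():
+    capacity = expert_capacity(
+        tokens=1024,
+        top_k=4,
+        num_experts=128,
+        expert_parallel_group=None,
+        moe_capacity_factor=1.25,
+        moe_expert_model_parallelism=False,
+    )
+    assert capacity == 40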
+
+def load_balancing_loss(
+ tokens_per_expert: torch.Tensor,
+ expert_scores: torch.Tensor,
+ top_k: int,
+ num_experts: int,
+):
+ assert len(expert_scores.size()) == 2
+    tokens, score_num_experts = expert_scores.size()
+    assert score_num_experts == num_experts
+    assert len(tokens_per_expert.size()) == 1
+    (count_num_experts,) = tokens_per_expert.size()
+    assert count_num_experts == num_experts
+ scale = num_experts / (tokens * top_k)
+ return scale * torch.dot(
+ tokens_per_expert.to(expert_scores.dtype),
+ expert_scores.mean(dim=0),
+ )
+
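+
+# Illustrative sketch (not executed at import): a perfectly balanced router
+# over 4 experts with top-1 routing and uniform scores yields a loss of 1.0
+# (scale = num_experts / (tokens * top_k), dotted with counts and mean scores).
+def _example_load_balancing_loss():
+    tokens, num_experts = 8, 4
+    tokens_per_expert = torch.full((num_experts,), tokens / num_experts)
+    expert_scores = torch.full((tokens, num_experts), 1.0 / num_experts)
+    loss = load_balancing_loss(
+        tokens_per_expert, expert_scores, top_k=1, num_experts=num_experts
+    )
+    assert torch.isclose(loss, torch.tensor(1.0))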
+
+def indices_and_bins(
+ top_expert: torch.Tensor,
+ sort_end_bit: int,
+ num_experts: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ top_expert = top_expert.int()
+
+ # Ensure contiguous memory layout
+ top_expert = top_expert.contiguous()
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(top_expert.device):
+ output = ops.sort(top_expert, sort_end_bit)
+ bin_ids, indices = output
+ tokens_per_expert = ops.histogram(top_expert, num_experts)
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+
+ bins = bins.view(1) if not len(bins.size()) else bins
+ return indices, bin_ids, bins, tokens_per_expert
+
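+
+# Illustrative sketch (not executed at import; needs a device these kernels
+# were built for): for expert assignments [2, 0, 2, 1] over 4 experts the
+# histogram is [1, 1, 2, 0] and the inclusive cumsum gives expert bin
+# boundaries [1, 2, 4, 4]; `indices` is the token order sorted by expert.
+def _example_indices_and_bins():
+    top_expert = torch.tensor([2, 0, 2, 1], device="cuda")
+    indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+        top_expert, sort_end_bit=2, num_experts=4
+    )
+    assert tokens_per_expert.tolist() == [1, 1, 2, 0]
+    assert bins.tolist() == [1, 2, 4, 4]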
+
+def expert_capacity_fn(
+ tokens: int,
+ top_k: int,
+ num_experts: int,
+ expert_parallel_group: torch.distributed.ProcessGroup,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+) -> int:
+ world_size = (
+ dist.get_world_size(expert_parallel_group)
+ if moe_expert_model_parallelism
+ else 1
+ )
+ tokens_per_expert = top_k * tokens * world_size / num_experts
+ return int(moe_capacity_factor * tokens_per_expert)
+
+
+def permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+):
+ # Route tokens to experts
+ x = x.view(-1, x.shape[-1])
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+
+ # Expert computation
+ x = mlp_forward(x, w1, w2, w1_bias, w2_bias, gradient_scale, alpha)
+
+ # Ensure CUB knows which device to use
+ with torch.cuda.device(x.device):
+ # Route tokens back
+ out = ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+ return out
+
+
+def forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+    expert_parallel_group: Optional[torch.distributed.ProcessGroup] = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ mlp_impl: Optional[str] = None,
+):
+ # x: [sl, bs, hs]
+ # expert_weights: [sl * bs, top-k]
+ # top_experts: [sl * bs, top-k]
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ with torch.no_grad():
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate expert capacity
+ sl, bs, _ = x.size()
+
+ expert_capacity = expert_capacity_fn(
+ sl * bs,
+ top_k,
+ num_experts,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+
+ if expert_capacity == 0:
+ expert_capacity = torch.max(tokens_per_expert).item()
+
+ x = permute_and_compute(
+ x,
+ tokens_per_expert,
+ indices,
+ bin_ids,
+ expert_weights,
+ bins,
+ expert_capacity,
+ top_k,
+ w1,
+ w2,
+ w1_bias,
+ w2_bias,
+ gradient_scale,
+ alpha,
+ )
+ return x, tokens_per_expert
+
+
+def parallel_forward_once(
+ x: torch.Tensor,
+ expert_weights: torch.Tensor,
+ top_experts: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ w1_bias: torch.Tensor,
+ w2_bias: torch.Tensor,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ top_k: int = 4,
+ num_experts: int = 128,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = True,
+ hidden_size: int = 1152,
+ mlp_impl: Optional[str] = "grouped",
+):
+ # Flatten inputs
+ expert_weights = expert_weights.flatten()
+ top_experts = top_experts.flatten()
+
+ # TODO: remove debugging var
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
+ with torch.no_grad():
+ # Step 1: Local permutation setup
+ indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
+ top_experts, sort_end_bit, num_experts
+ )
+
+ # Calculate sharding parameters
+ world_size = dist.get_world_size(expert_parallel_group)
+ hidden_sharding_deg = hidden_sharding_degree(
+ world_size, num_experts, hidden_size
+ )
+ experts_per_rank_val = experts_per_rank(num_experts, world_size)
+
+ # Replicate token counts for hidden sharding
+ repeated_tokens_per_expert = ops.repeat(
+ tokens_per_expert, (hidden_sharding_deg,)
+ )
+
+ # Exchange token counts across devices
+ parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
+
+ # Ensure CUB knows which device to use
+ tpe_handle = dist.all_to_all_single(
+ parallel_tokens_per_expert,
+ repeated_tokens_per_expert,
+ group=expert_parallel_group,
+ async_op=True,
+ )
+
+ # Step 2: Local permutation - group tokens by target device
+ x = x.view(-1, x.shape[-1]) # [sl * bs, hs]
+ x = ops.gather(x, indices, bin_ids, bins, top_k)
+
+ # Step 3: Compute communication counts and exchange tokens
+ with torch.no_grad():
+ tpe_handle.wait()
+
+ # Reshape for per-device calculations
+ repeated_tokens_per_expert = repeated_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+ parallel_tokens_per_expert = parallel_tokens_per_expert.view(
+ world_size, experts_per_rank_val
+ )
+
+ # Calculate send/recv counts
+ send_counts = repeated_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ # recv_counts = parallel_tokens_per_expert.cpu().sum(dim=-1).tolist()
+ parallel_tokens_per_expert_cpu = parallel_tokens_per_expert.cpu()
+ recv_counts = parallel_tokens_per_expert_cpu.sum(dim=-1).tolist()
+ tokens_received = sum(recv_counts)
+
+ # Replicate for hidden sharding
+ x = ops.repeat(x, (hidden_sharding_deg, 1))
+
+ # Cross-device token exchange
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
+ )
+
+ with torch.no_grad():
+ # Step 4: Setup for local expert computation
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
+ replicate_bins = (
+ replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
+ )
+
+ # Create expert indices for received tokens
+ parallel_top_expert = torch.remainder(
+ torch.arange(
+ num_experts * hidden_sharding_deg,
+ dtype=torch.int32,
+ device=indices.device,
+ ),
+ experts_per_rank_val,
+ )
+ parallel_top_expert = ops.replicate(
+ parallel_top_expert.unsqueeze(dim=0),
+ replicate_bins,
+ tokens_received,
+ ).flatten()
+
+ # Sort tokens by expert assignment
+ parallel_bin_ids, parallel_indices = ops.sort(
+ parallel_top_expert,
+ sort_end_bit,
+ )
+
+ # Calculate bins for local experts
+ parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
+ dim=0, dtype=torch.int
+ )
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+ parallel_bins = (
+ parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
+ )
+
+ # Calculate expert capacity
+ expert_capacity = expert_capacity_fn(
+ tokens_received,
+ top_k,
+ experts_per_rank_val,
+ expert_parallel_group,
+ moe_capacity_factor,
+ moe_expert_model_parallelism,
+ )
+ if expert_capacity == 0:
+ expert_capacity = torch.max(parallel_tokens_per_expert).item()
+
+ # Locally permute the tokens and perform the expert computation.
+ # Block to make sure that the cross-device permutation is complete.
+ if mlp_impl == "grouped":
+ # GroupedMLP requires counts on CPU. We can use the tensor already
+ # moved to CPU for the prior all_to_all, which avoids an extra
+ # device synchronization.
+ parallel_tokens_per_expert = parallel_tokens_per_expert_cpu.sum(
+ dim=0,
+ dtype=torch.int,
+ )
+
+ # Step 5: Expert computation
+ parallel_x_handle.wait()
+
+ parallel_x = permute_and_compute(
+ parallel_x,
+ parallel_tokens_per_expert,
+ parallel_indices,
+ parallel_bin_ids,
+ None, # expert_weights
+ parallel_bins,
+ expert_capacity,
+ top_k=1,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ )
+
+ # Step 6: Reverse communication - send results back
+ x, _ = _layers.all_to_all.all_to_all(
+ parallel_x, send_counts, recv_counts, expert_parallel_group
+ )
+
+ # Step 7: Reduce across hidden sharding dimension
+ shape = (hidden_sharding_deg, -1, hidden_size)
+ x = x.view(shape).sum(dim=0)
+
+ # Step 8: Final local unpermutation
+ x = ops.scatter(x, indices, bin_ids, expert_weights, bins, top_k)
+
+ return x, tokens_per_expert.flatten()
+
+
+def moe_forward(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # Route tokens to experts
+ logits, expert_weights, expert_indices = route_tokens(
+ x,
+ router_weight,
+ router_bias,
+ moe_top_k,
+ moe_num_experts,
+ moe_jitter_eps,
+ moe_normalize_expert_weights,
+ uniform_expert_assignment,
+ training,
+ )
+
+ # Create router scores for output
+ router_scores = (
+ torch.zeros_like(logits)
+ .scatter_(1, expert_indices, expert_weights)
+ .transpose(0, 1)
+ )
+
+ in_shape = x.size()
+
+ # Prepare forward function arguments
+ forward_args = {
+ "x": x,
+ "expert_weights": expert_weights,
+ "top_experts": expert_indices,
+ "w1": w1,
+ "w2": w2,
+ "w1_bias": w1_bias,
+ "w2_bias": w2_bias,
+ "gradient_scale": gradient_scale,
+ "alpha": alpha,
+ "sort_end_bit": sort_end_bit,
+ "top_k": moe_top_k,
+ "num_experts": moe_num_experts,
+ "expert_parallel_group": expert_parallel_group,
+ "moe_capacity_factor": moe_capacity_factor,
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
+ "mlp_impl": mlp_impl,
+ }
+
+ # Add hidden_size for parallel forward
+ if moe_expert_model_parallelism and hidden_size is not None:
+ forward_args["hidden_size"] = hidden_size
+ elif moe_expert_model_parallelism and hidden_size is None:
+ # Infer hidden_size from input shape
+ forward_args["hidden_size"] = x.shape[-1]
+
+ # Compute expert outputs
+ x, tokens_per_expert = forward_fn(**forward_args)
+
+ # Save load balancing loss if needed
+ moe_loss_weight = 0.0 # Can be made configurable
+ if training and moe_loss_weight > 0:
+ save_load_balancing_loss((tokens_per_expert, logits))
+
+ # Restore original shape
+ x = x.view(in_shape)
+
+ return x, expert_weights, router_scores
+
+
+def moe_forward_with_shared_expert(
+ x: torch.Tensor,
+ router_weight: torch.Tensor,
+ router_bias: Optional[torch.Tensor],
+ moe_top_k: int,
+ moe_num_experts: int,
+ moe_jitter_eps: float = None,
+ moe_normalize_expert_weights: int = None,
+ uniform_expert_assignment: bool = False,
+ training: bool = False,
+ w1: torch.Tensor = None,
+ w2: torch.Tensor = None,
+ w1_bias: torch.Tensor = None,
+ w2_bias: torch.Tensor = None,
+ gradient_scale: Optional[float] = None,
+ alpha: float = 1.702,
+ sort_end_bit: int = 0,
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
+ moe_capacity_factor: float = 1.0,
+ moe_expert_model_parallelism: bool = False,
+ forward_fn: Any = None,
+ hidden_size: int = None,
+ mlp_impl: str = "grouped",
+ # Shared expert parameters
+ shared_up_proj_weight: Optional[torch.Tensor] = None,
+ shared_down_proj_weight: Optional[torch.Tensor] = None,
+ shared_up_proj_bias: Optional[torch.Tensor] = None,
+ shared_down_proj_bias: Optional[torch.Tensor] = None,
+ shared_expert_weighted_sum: bool = False,
+ shared_activation_fn: Optional[Any] = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+ # First, compute regular MoE forward pass
+ expert_out, expert_weights, router_scores = moe_forward(
+ x=x,
+ router_weight=router_weight,
+ router_bias=router_bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=training,
+ w1=w1,
+ w2=w2,
+ w1_bias=w1_bias,
+ w2_bias=w2_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=moe_expert_model_parallelism,
+ forward_fn=forward_fn,
+ hidden_size=hidden_size,
+ mlp_impl=mlp_impl,
+ )
+
+ # If shared expert weights provided, compute shared expert output
+ if shared_up_proj_weight is not None and shared_down_proj_weight is not None:
+ shared_expert_out = shared_mlp_forward(
+ x=x,
+ up_proj_weight=shared_up_proj_weight,
+ down_proj_weight=shared_down_proj_weight,
+ up_proj_bias=shared_up_proj_bias,
+ down_proj_bias=shared_down_proj_bias,
+ activation_fn=shared_activation_fn,
+ gradient_scale=gradient_scale,
+ )
+
+ # Combine expert outputs
+ combined_out = combine_expert_shared_outputs(
+ shared_expert_out=shared_expert_out,
+ expert_out=expert_out,
+ shared_expert_weighted_sum=shared_expert_weighted_sum,
+ moe_top_k=moe_top_k,
+ )
+
+ return combined_out, expert_weights, router_scores
+
+ # Return regular MoE output if no shared expert
+ return expert_out, expert_weights, router_scores
+
+
+def create_shared_expert_weights(
+ hidden_size: int,
+ shared_expert_hidden_size: int,
+ device: torch.device,
+ dtype: torch.dtype,
+ init_method: Any,
+ output_layer_init_method: Any = None,
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+
+ if output_layer_init_method is None:
+ output_layer_init_method = init_method
+
+ # Create weight tensors
+ up_proj_weight = torch.empty(
+ shared_expert_hidden_size,
+ hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+ down_proj_weight = torch.empty(
+ hidden_size,
+ shared_expert_hidden_size,
+ device=device,
+ dtype=dtype,
+ )
+
+ # Initialize weights
+ init_method(up_proj_weight)
+ output_layer_init_method(down_proj_weight)
+
+ # No bias by default
+ return up_proj_weight, down_proj_weight, None, None
+
+
+# HACK: Extract device_mesh from pre-hook closure - required for transformers integration
+# This exists because device_mesh is trapped in hook closures with no model attribute
+# Fragile - breaks if hook structure changes or Python internals change
+# TODO: Replace with a more robust solution when available
+def get_device_mesh(model):
+ # Extract device_mesh from child's unused pre_hook closure
+ try:
+ # Find the pre-hook that contains 'device_mesh' in its closure
+ hook = next(
+ h
+ for h in model.experts._forward_pre_hooks.values()
+ if "device_mesh" in h.__code__.co_freevars
+ )
+ # Extract the device_mesh from the closure
+ return hook.__closure__[
+ hook.__code__.co_freevars.index("device_mesh")
+ ].cell_contents
+ except Exception:
+ return None
+
+
+class MegaBlocksMoeMLP(torch.nn.Module):
+ can_torch_compile: bool = True
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+ output, expert_weights_out, *_ = moe_forward(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ )
+ return output, expert_weights_out
+
+
+# Export main classes
+__all__ = ["MegaBlocksMoeMLP", "MegaBlocksMoeMLPWithSharedExpert"]
+
+
+class MegaBlocksMoeMLPWithSharedExpert(MegaBlocksMoeMLP):
+
+ def __init__(self):
+ super().__init__()
+ # Shared expert weights will be set by the user
+ self.shared_up_proj_weight = None
+ self.shared_down_proj_weight = None
+ self.shared_up_proj_bias = None
+ self.shared_down_proj_bias = None
+ self.shared_expert_weighted_sum = False
+ self.shared_activation_fn = None
+
+ def set_shared_expert_weights(
+ self,
+ up_proj_weight: torch.Tensor,
+ down_proj_weight: torch.Tensor,
+ up_proj_bias: Optional[torch.Tensor] = None,
+ down_proj_bias: Optional[torch.Tensor] = None,
+ weighted_sum: bool = False,
+ activation_fn: Optional[Any] = None,
+ ):
+ self.shared_up_proj_weight = up_proj_weight
+ self.shared_down_proj_weight = down_proj_weight
+ self.shared_up_proj_bias = up_proj_bias
+ self.shared_down_proj_bias = down_proj_bias
+ self.shared_expert_weighted_sum = weighted_sum
+ self.shared_activation_fn = activation_fn
+
+    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ moe_top_k = getattr(self.router, "top_k", 4)
+ moe_num_experts = getattr(self.experts, "num_experts", 128)
+ gradient_scale = getattr(self.experts, "gradient_scale", None)
+ alpha = getattr(self.experts, "alpha", 1.0)
+ moe_capacity_factor = getattr(self.experts, "capacity_factor", 1.0)
+ moe_jitter_eps = getattr(self.experts, "jitter_eps", None)
+ moe_normalize_expert_weights = getattr(
+ self.experts, "normalize_expert_weights", None
+ )
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
+
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
+ if expert_parallel_group is None:
+ device_mesh = get_device_mesh(self)
+ expert_parallel_group = device_mesh.get_group() if device_mesh else None
+
+ has_parallel = (
+ expert_parallel_group is not None
+ and dist.is_initialized()
+ and dist.get_world_size(expert_parallel_group) > 1
+ )
+ forward_fn = parallel_forward_once if has_parallel else forward_once
+
+ sort_end_bit = max(
+ int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
+ )
+ mlp_impl = getattr(self, "mlp_impl", "grouped")
+
+ output, expert_weights_out, *_ = moe_forward_with_shared_expert(
+ x=x,
+ router_weight=self.router.weight,
+ router_bias=self.router.bias,
+ moe_top_k=moe_top_k,
+ moe_num_experts=moe_num_experts,
+ moe_jitter_eps=moe_jitter_eps,
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
+ uniform_expert_assignment=uniform_expert_assignment,
+ training=self.training,
+ w1=self.experts.gate_up_proj,
+ w2=self.experts.down_proj,
+ w1_bias=self.experts.gate_up_proj_bias,
+ w2_bias=self.experts.down_proj_bias,
+ gradient_scale=gradient_scale,
+ alpha=alpha,
+ sort_end_bit=sort_end_bit,
+ expert_parallel_group=expert_parallel_group,
+ moe_capacity_factor=moe_capacity_factor,
+ moe_expert_model_parallelism=has_parallel,
+ forward_fn=forward_fn,
+ hidden_size=self.experts.hidden_size,
+ mlp_impl=mlp_impl,
+ # Shared expert parameters
+ shared_up_proj_weight=self.shared_up_proj_weight,
+ shared_down_proj_weight=self.shared_down_proj_weight,
+ shared_up_proj_bias=self.shared_up_proj_bias,
+ shared_down_proj_bias=self.shared_down_proj_bias,
+ shared_expert_weighted_sum=self.shared_expert_weighted_sum,
+ shared_activation_fn=self.shared_activation_fn,
+ )
+ return output, expert_weights_out
+
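+
+# Illustrative sketch (not executed at import; sizes and device are
+# hypothetical): attaching a shared expert to a MegaBlocksMoeMLPWithSharedExpert
+# instance using the create_shared_expert_weights helper defined above.
+def _example_shared_expert_setup(mlp: MegaBlocksMoeMLPWithSharedExpert):
+    up_w, down_w, up_b, down_b = create_shared_expert_weights(
+        hidden_size=1152,
+        shared_expert_hidden_size=3072,
+        device=torch.device("cuda"),
+        dtype=torch.bfloat16,
+        init_method=torch.nn.init.kaiming_uniform_,
+    )
+    mlp.set_shared_expert_weights(
+        up_w, down_w, up_b, down_b, weighted_sum=True
+    )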
+
+# Patch for XPU or CPU support
+if hasattr(torch, "xpu") and torch.xpu.is_available():
+ from .xpu_fused_moe import MegaBlocksMoeMLP
+
+from .cpu_moe_cpp import CPUMegaBlocksMoeMLP
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/megablocks/__init__.py b/build/torch29-cxx11-xpu20252-x86_64-linux/megablocks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/megablocks/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/metadata.json b/build/torch29-cxx11-xpu20252-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..b911d0a2549a35a1c65ab7e77d32e5aac23cd6ac
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/metadata.json
@@ -0,0 +1,8 @@
+{
+ "version": 1,
+ "license": "Apache-2.0",
+ "python-depends": [],
+ "backend": {
+ "type": "xpu"
+ }
+}
\ No newline at end of file
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/ops/__init__.py b/build/torch29-cxx11-xpu20252-x86_64-linux/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b944080df810d0b0cfc571f3009b0098a651f9b7
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/ops/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
+
+__all__ = [
+ 'binned_gather',
+ 'binned_scatter',
+ 'exclusive_cumsum',
+ 'inclusive_cumsum',
+ 'gather',
+ 'histogram',
+ 'padded_gather',
+ 'padded_scatter',
+ 'repeat',
+ 'replicate',
+ 'round_up',
+ 'scatter',
+ 'sort',
+ 'sum',
+ 'topology',
+]
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/ops/all_to_all_benchmark.py b/build/torch29-cxx11-xpu20252-x86_64-linux/ops/all_to_all_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c939818edca3345f6344bbc7cef07ffe3cd0181
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/ops/all_to_all_benchmark.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.distributed as dist
+
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
+
+_ALL_TO_ALL_BENCHMARK = (
+ (8, 1024),
+ (16, 1024),
+ (32, 1024),
+ (64, 1024),
+ (128, 1024),
+ (256, 1024),
+ (512, 1024),
+ (1024, 1024),
+ (2 * 1024, 1024),
+ (4 * 1024, 1024),
+ (8 * 1024, 1024),
+ (16 * 1024, 1024),
+ (32 * 1024, 1024),
+ (64 * 1024, 1024),
+ (128 * 1024, 1024),
+ (256 * 1024, 1024),
+ (512 * 1024, 1024),
+ (1024 * 1024, 1024),
+)
+
+
+def benchmark_all_to_all(group, sl, hs):
+ world_size = dist.get_world_size(group)
+ assert (sl % world_size) == 0
+ send_recv_sizes = [sl // world_size] * world_size
+
+ x = torch.randn((sl, hs)).cuda().half()
+
+ details = {
+ 'world_size': world_size,
+        'message_size (B)': send_recv_sizes[0] * hs * 2,  # 2 bytes per fp16 element.
+ }
+
+ def benchmark():
+ return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
+
+ time, std = benchmark_util.benchmark_function(benchmark)
+
+ if dist.get_rank(group) == 0:
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
+
+
+if __name__ == '__main__':
+ assert dist.is_available()
+ group = dist.init_process_group(backend='nccl')
+ local_rank = dist.get_rank(group)
+ torch.cuda.set_device(local_rank)
+
+ for args in _ALL_TO_ALL_BENCHMARK:
+ benchmark_all_to_all(group, *args)
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/ops/binned_gather.py b/build/torch29-cxx11-xpu20252-x86_64-linux/ops/binned_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..189a7fa3518d660f29ea32e7a04827164af98d60
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/ops/binned_gather.py
@@ -0,0 +1,37 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_gather kernel.
+class BinnedGatherOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ bins: torch.Tensor,
+ bin_size: int,
+ top_k: int,
+ ):
+ ctx.save_for_backward(indices, bins)
+ ctx.top_k = top_k
+ return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
+
+ @staticmethod
+ @custom_bwd
+ def backward(ctx: Any, grad: torch.Tensor):
+ grad = grad.contiguous()
+ indices, bins = ctx.saved_tensors
+ out = kernels.binned_scatter(grad, indices, None, bins, ctx.top_k)
+ return out, None, None, None, None
+
+
+binned_gather = BinnedGatherOp.apply
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/ops/binned_scatter.py b/build/torch29-cxx11-xpu20252-x86_64-linux/ops/binned_scatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb937c0c106662ce8108c1cb926f8f063b163d3d
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/ops/binned_scatter.py
@@ -0,0 +1,59 @@
+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any
+
+import torch
+from .stk_autocast import custom_bwd, custom_fwd
+
+from ..backend import kernels
+
+
+# Autograd wrapper for binned_scatter kernel.
+class BinnedScatterOp(torch.autograd.Function):
+
+ @staticmethod
+ @custom_fwd
+ def forward(
+ ctx: Any,
+ x: torch.Tensor,
+ indices: torch.Tensor,
+ weights: t